In [35]:
import pandas as pd 
import numpy as np

import warnings
from sklearn.exceptions import ConvergenceWarning, FitFailedWarning
warnings.filterwarnings(action='ignore', category=ConvergenceWarning)
warnings.filterwarnings(action='ignore', category=FitFailedWarning)

In [36]:
info = pd.read_excel('Final Cleaned Data.xlsx')
info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1185 entries, 0 to 1184
Data columns (total 8 columns):
ASIN                    1185 non-null object
Description             1185 non-null object
Price                   1185 non-null float64
Verified Subcategory    1185 non-null object
Description_New         1185 non-null object
Description_Count       1185 non-null object
Description_Ngram       1185 non-null object
Description_Tfidf       1185 non-null object
dtypes: float64(1), object(7)
memory usage: 74.2+ KB


In [37]:
'''
    Model Selection:
        Here I choose MultinomialNB, RandomForestClassifier, and LogisticRegression
        1. MultinomialNB: 
            - can calculate the probability of the label (for this project: Category) given these word vectors
        2. RandomForestClassifier:
            - It overall performs well on classification problems 
            - I think the way Decision Tree works is similar to how human would classify the ASIN given the description
        3. LogisticRegression:
            - A simple and easy-to-understand model that always worth a try
        There are other models such as neural network that I believe will also perform well on this task. But for this project I will just use traditional machine learning models
'''
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
nb = MultinomialNB()
rf = RandomForestClassifier()
lr = LogisticRegression()

In [79]:
'''
    Feature Engineering:
        Here I choose some of the most common feature engineering tools for NLP are bag-of-word, n-gram, and tfidf
        1. Bag-of-word: 
            - word frequency count; 
            - one of the easiest but useful NLP feature engineering tool
        2. N-gram: 
            - N-consecutive word frequency count; 
            - taking the context of each word into account
        3. Tfidf: 
            - the word's weight is counted based on its frequency on the document and the corpus; 
            - useful for determine the document's topic among a collection of documents(corpus)
        
        Since all these products are related to av display technologies, I would expect they share similar description;
        Thus, TfidfVectorizer might be the best-perform vectorizer; 
        but I also include bag-of-words and ngram to have more variety on features
'''

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
X_count = CountVectorizer().fit_transform(info['Description_New'])
X_ngram = CountVectorizer(ngram_range=(2,3)).fit_transform(info['Description_New'])
X_tfidf = TfidfVectorizer().fit_transform(info['Description_New'])
y = info['Verified Subcategory']

In [82]:
'''
    Hyperparameter Tuning: 
        There are a few hyperparameter tuning methods that are commonly used: GridSearchCV, RandomizedSearchCV, Bayesian Optimization, etc. 
        I choose GridSearchCV for MultinomialNB and RandomizedSearchCV for RandomForestClassifier and LogisticRegression
        because of the dimension of each model's parameter list and these methods are provided by sklearn library  
'''
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
param_nb = {'alpha': [0.5, 1, 2, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60]}
param_rf = {
    'n_estimators': [20, 50, 100, 150, 200],
    'max_depth': [10, 30, 50, 70, None],
    'bootstrap': [True, False]
    }
param_lr = {
    'C': [1, 10, 100, 1000, 1500, 2000, 2500],
    'max_iter': [100, 200, 300, 400, 500, None],
    'fit_intercept' : [True, False],
    'multi_class' : ['auto', 'ovr', 'multinomial'],
    'solver' : ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
           }

In [40]:
'''
    Fitting bag-of-word to all three models with hyperparameter tuning
'''

gs_nb1 = GridSearchCV(estimator=nb, param_distributions=param_nb, cv=5, verbose=0)
gs_nb1.fit(X_count, y)
print("nb score:", gs_nb1.best_score_)
print("nb best params:", gs_nb1.best_params_)
gs_rf1 = RandomizedSearchCV(estimator=rf, param_distributions=param_rf, cv=5, verbose=0)
gs_rf1.fit(X_count, y)
print("rf score:", gs_rf1.best_score_)
print("rf best params:", gs_rf1.best_params_)
gs_lr1 = RandomizedSearchCV(estimator=lr, param_distributions=param_lr, cv=5, verbose=0)
gs_lr1.fit(X_count, y)
print("lr score:", gs_lr1.best_score_)
print("lr best params:", gs_lr1.best_params_)

nb score: 0.8135021097046413
nb best params: {'alpha': 1}
rf score: 0.8438818565400844
rf best params: {'n_estimators': 50, 'max_depth': None, 'bootstrap': False}
lr score: 0.839662447257384
lr best params: {'solver': 'saga', 'multi_class': 'ovr', 'max_iter': 100, 'fit_intercept': True, 'C': 1500}


In [47]:
'''
    Fitting n-gram to all three models with hyperparameter tuning
'''
gs_nb2 = GridSearchCV(estimator=nb, param_distributions=param_nb, cv=5, verbose=0)
gs_nb2.fit(X_ngram, y)
print("nb score:", gs_nb2.best_score_)
print("nb best params:", gs_nb2.best_params_)
gs_rf2 = RandomizedSearchCV(estimator=rf, param_distributions=param_rf, cv=5, verbose=0)
gs_rf2.fit(X_ngram, y)
print("rf score:", gs_rf2.best_score_)
print("rf best params:", gs_rf2.best_params_)
gs_lr2 = RandomizedSearchCV(estimator=lr, param_distributions=param_lr, cv=5, verbose=0)
gs_lr2.fit(X_ngram, y)
print("lr score:", gs_lr2.best_score_)
print("lr best params:", gs_lr2.best_params_)

nb score: 0.7856540084388186
nb best params: {'alpha': 1}
rf score: 0.8042194092827003
rf best params: {'n_estimators': 100, 'max_depth': 70, 'bootstrap': False}
lr score: 0.8540084388185653
lr best params: {'solver': 'liblinear', 'multi_class': 'ovr', 'max_iter': 300, 'fit_intercept': False, 'C': 1000}


In [48]:
'''
    Fitting tfidf to all three models with hyperparameter tuning
'''
gs_nb3 = GridSearchCV(estimator=nb, param_distributions=param_nb, cv=5, verbose=0)
gs_nb3.fit(X_tfidf, y)
print("nb score:", gs_nb3.best_score_)
print("nb best params:", gs_nb3.best_params_)
gs_rf3 = RandomizedSearchCV(estimator=rf, param_distributions=param_rf, cv=5, verbose=0)
gs_rf3.fit(X_tfidf, y)
print("rf score:", gs_rf3.best_score_)
print("rf best params:", gs_rf3.best_params_)
gs_lr3 = RandomizedSearchCV(estimator=lr, param_distributions=param_lr, cv=5, verbose=0)
gs_lr3.fit(X_tfidf, y)
print("lr score:", gs_lr3.best_score_)
print("lr best params:", gs_lr3.best_params_)

nb score: 0.6962025316455696
nb best params: {'alpha': 0.5}
rf score: 0.8430379746835441
rf best params: {'n_estimators': 200, 'max_depth': 70, 'bootstrap': False}
lr score: 0.8649789029535866
lr best params: {'solver': 'saga', 'multi_class': 'auto', 'max_iter': 400, 'fit_intercept': True, 'C': 100}


In [50]:
'''
    LogisticRegression model with tfidf feature is the best-performing model;
    I want to take a deeper look into how the model performs on each category and its different scoring
'''

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, stratify=y, test_size=0.3)
best_model = gs_lr3.best_estimator_
score_best = cross_val_score(best_model, X_tfidf, y, n_jobs=-1, verbose=0, cv=5)

best_model.fit(X_train, y_train)
pred = best_model.predict(X_test)
print(classification_report(y_test, pred))

                                precision    recall  f1-score   support

   DP and MDP Display Adapters       0.90      0.99      0.94       127
         HDMI and DVI Adapters       0.78      0.82      0.80        34
                          None       0.90      0.81      0.85        90
Thunderbolt 3 Display Adapters       1.00      0.75      0.86         4
        USB-A Display Adapters       1.00      0.97      0.98        30
        USB-C Display Adapters       0.96      0.90      0.93        71

                      accuracy                           0.91       356
                     macro avg       0.92      0.87      0.89       356
                  weighted avg       0.91      0.91      0.91       356



In [56]:
'''
    Overall score of 0.86 is not bad, and we have used hyperparameter tuning to combat overfitting problem.
    But still I wonder if we can further reduce model variance and biases.
    One way to reduce model variance is using ensemble method: Bagging 
'''

from sklearn.ensemble import BaggingClassifier

bg = BaggingClassifier(
    base_estimator=best_model,
    n_estimators=10
)

#bg.fit(X_train, y_train)
score_bg = cross_val_score(bg, X_tfidf, y, n_jobs=-1, verbose=0)
print("Bagging score:", score_bg.mean())
print("Best individual estimator score:", score_best.mean())

Bagging score: 0.8556962025316455
Best individual estimator score: 0.861603375527426


In [65]:
'''
    Boosting is another ensemble method that might reduce model’s bias
'''

from sklearn.ensemble import GradientBoostingClassifier

gb = GradientBoostingClassifier(learning_rate=0.1, n_estimators=100)
score_gb = cross_val_score(gb, X_tfidf, y, n_jobs=-1, verbose=0, cv=5)
print("Gradient Boosting score:", score_gb.mean())
print("Best individual estimator score:", score_best.mean())

Gradient Boosting score: 0.8329113924050633
Best individual estimator score: 0.861603375527426


In [87]:
'''
    Since Bagging and Boosting don't seem to make improvements
    We will just save the best estimators for each engineered feature
'''
import joblib

joblib.dump(gs_rf1.best_estimator_, "Text_RF_count.pkl")
joblib.dump(gs_lr2.best_estimator_, "Text_LR_ngram.pkl")
joblib.dump(gs_lr3.best_estimator_, "Text_LR_tfidf.pkl")


['Text_LR_tfidf.pkl']