In [1]:
!ls data/amazon-reviews/

baby-products.csv        grocery-gourmet-food.csv toys-games.csv
beauty.csv               health-personal-care.csv train_40k.csv
cat1.csv                 pet-supplies.csv         val_10k.csv


# Top level classifier

In [5]:
from sklearn import *
import csv
import pandas as pd
from sklearn.pipeline import Pipeline

In [55]:
clf1 = Pipeline(
    steps=[
        ('tok', feature_extraction.text.TfidfVectorizer(min_df = 3, max_df=0.8, ngram_range=[1,2])),
        ('clf', svm.LinearSVC(penalty='l2', multi_class='ovr', class_weight='balanced'))
    ]
)

In [56]:
df1 = pd.read_csv('data/amazon-reviews/cat1.csv', index_col=False); df1.head()

Unnamed: 0.1,Unnamed: 0,Cat1,text_feature
0,0,grocery gourmet food,Golden Valley Natural Buffalo Jerky The descri...
1,1,toys games,Westing Game This was a great book!!!! It is w...
2,2,toys games,"Westing Game I am a first year teacher, teachi..."
3,3,toys games,Westing Game I got the book at my bookfair at ...
4,4,toys games,I SPY A is For Jigsaw Puzzle 63pc Hi! I'm Mart...


In [57]:
X = list(df1['text_feature'])
Y = list(df1['Cat1'])
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(X,Y, train_size=0.8, random_state=1)

In [58]:
assert len(X_train) == len(Y_train)

In [59]:
clf1.fit(X_train, Y_train)

Pipeline(steps=[('tok',
                 TfidfVectorizer(max_df=0.8, min_df=3, ngram_range=[1, 2])),
                ('clf', LinearSVC(class_weight='balanced'))])

In [60]:
Y_pred = clf1.predict(X_test)
print(metrics.classification_report(Y_test, Y_pred))
print(metrics.confusion_matrix(Y_test, Y_pred))

                      precision    recall  f1-score   support

       baby products       0.96      0.95      0.96      1161
              beauty       0.92      0.94      0.93      1129
grocery gourmet food       0.93      0.95      0.94       724
health personal care       0.93      0.92      0.93      1946
        pet supplies       0.97      0.96      0.96       957
          toys games       0.97      0.97      0.97      2083

            accuracy                           0.95      8000
           macro avg       0.95      0.95      0.95      8000
        weighted avg       0.95      0.95      0.95      8000

[[1106    8    0    9    5   33]
 [   3 1058    7   55    2    4]
 [   2    3  690   25    0    4]
 [  13   72   35 1797   14   15]
 [   5    5    4   19  916    8]
 [  19    3    2   24   12 2023]]


In [63]:
clf1.decision_function(X_test)[0]
Y_pred[0]

'baby products'

- [x] Look at examples where model is incorrect - might tell us about the data
Conclusion: baby products vs toys games, health personal care vs beauty, grocery gourmet food vs health personal care... there are a lot of instances where these classes share many of their features. Not surprising that it mixes them up from time to time. 
- [ ] Create loop that train models for each sub category

In [23]:
incorrect_ids = []
for i, (y_pred, y_test) in enumerate(zip(Y_pred, Y_test)):
    if y_pred != y_test:
        incorrect_ids.append(i)

In [29]:
len(incorrect_ids)

418

In [26]:
incorrect_preds = []
for i in incorrect_ids:
    incorrect_preds.append({
        "text": X_test[i],
        "true": Y_test[i],
        "pred": Y_pred[i]
    })
incorrect_df = pd.DataFrame(incorrect_preds)

In [27]:
incorrect_df.head()

Unnamed: 0,text,true,pred
0,Natural Oatmeal Shampoo - Gallon I have used t...,pet supplies,beauty
1,Blue Mountain Wallcoverings GAPP1760 Pooh Scen...,baby products,toys games
2,Tweezerman Safety Slide Callus Shaver with Ras...,beauty,health personal care
3,Personalized Stainless Steel Beaded Spoon I ju...,baby products,toys games
4,"Ricochet Sugar Free Mints with Xylitol, Fruit ...",health personal care,grocery gourmet food


In [28]:
incorrect_df.to_csv('cat1-errors.csv')

# Sub category classifiers

In [30]:
!ls data/amazon-reviews/

baby-products.csv        grocery-gourmet-food.csv toys-games.csv
beauty.csv               health-personal-care.csv train_40k.csv
cat1.csv                 pet-supplies.csv         val_10k.csv


In [41]:
pd.read_csv('data/amazon-reviews/baby-products.csv')["Cat2-relative"].unique()

array([nan, 'gear', 'gifts', 'feeding', 'diapering', 'safety', 'nursery',
       'bathing skin care', 'car seats accessories', 'strollers',
       'pregnancy maternity', 'potty training', 'health baby care'],
      dtype=object)

In [35]:
file_list = [
    ("baby product", "data/amazon-reviews/baby-products.csv"),
    ("beauty", "data/amazon-reviews/beauty.csv"),
    ("grocery gourmetfood", "data/amazon-reviews/grocery-gourmet-food.csv"),
    ("healthpersonal care", "data/amazon-reviews/health-personal-care.csv"),
    ("pet supplies", "data/amazon-reviews/pet-supplies.csv"),
    ("toys games", "data/amazon-reviews/toys-games.csv")
]

In [64]:
subcat_dict = {}
for (c, f) in file_list:
    print(c, f)
    clf = Pipeline(
        steps=[
            ('tok', feature_extraction.text.TfidfVectorizer(min_df = 3, max_df=0.8, ngram_range=[1,2])),
            ('clf', svm.LinearSVC(penalty='l2', multi_class='ovr', class_weight='balanced'))
        ]
    )
    sub_df = pd.read_csv(f).fillna('n/a')
    X = sub_df['text_feature']
    Y = sub_df['Cat2-relative']
    
    X_train, X_test, Y_train, Y_test = model_selection.train_test_split(X,Y, train_size=0.8, random_state=1)
    
    
    clf.fit(X_train, Y_train)
    Y_pred = clf.predict(X_test)
    Y_scores = clf.decision_function(X_test)
    subcat_dict[c] = {
        "clf": clf,
        "X_train": X_train,
        "X_test": X_test,
        "Y_train": Y_train,
        "Y_test": Y_test,
        "Y_scores": Y_scores
    }
    print(metrics.classification_report(Y_test, Y_pred))
    print(metrics.confusion_matrix(Y_test, Y_pred))
    print("---------------------------------------------")

baby product data/amazon-reviews/baby-products.csv
                       precision    recall  f1-score   support

    bathing skin care       0.84      0.75      0.80        65
car seats accessories       0.87      0.93      0.90        73
            diapering       0.94      0.97      0.96       221
              feeding       0.95      0.91      0.93       191
                 gear       0.87      0.91      0.89       138
                gifts       1.00      0.78      0.88        27
     health baby care       1.00      0.58      0.74        12
                  n/a       0.99      0.99      0.99      6839
              nursery       0.95      0.84      0.89       178
       potty training       0.87      0.90      0.88        29
  pregnancy maternity       0.96      1.00      0.98        23
               safety       0.97      0.84      0.90       153
            strollers       0.92      0.86      0.89        51

             accuracy                           0.98      8000
  

  _warn_prf(average, modifier, msg_start, len(result))


                                  precision    recall  f1-score   support

                       baby food       1.00      0.69      0.82        13
                       beverages       0.91      0.91      0.91       168
                   breads bakery       0.63      0.70      0.67        27
                 breakfast foods       0.85      0.88      0.86        58
                 candy chocolate       0.87      0.82      0.84        71
         cooking baking supplies       0.40      0.18      0.25        11
                      dairy eggs       0.88      0.78      0.82         9
fresh flowers live indoor plants       1.00      0.78      0.88         9
                   gourmet gifts       1.00      0.50      0.67        16
                           herbs       0.57      0.31      0.40        13
                    meat poultry       0.80      0.67      0.73         6
                    meat seafood       1.00      0.12      0.22         8
                             n/a     