In [1]:
!pip freeze

appnope==0.1.0
argon2-cffi==20.1.0
attrs==19.3.0
backcall==0.2.0
bleach==3.1.5
blis==0.4.1
boto==2.49.0
boto3==1.14.44
botocore==1.17.44
catalogue==1.0.0
certifi==2020.6.20
cffi==1.14.2
chardet==3.0.4
click==7.1.2
cycler==0.10.0
cymem==2.0.3
decorator==4.4.2
defusedxml==0.6.0
docutils==0.15.2
en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz
entrypoints==0.3
gensim==3.8.3
idna==2.10
ipykernel==5.3.4
ipython==7.17.0
ipython-genutils==0.2.0
ipywidgets==7.5.1
jedi==0.17.2
Jinja2==2.11.2
jmespath==0.10.0
joblib==0.16.0
jsonschema==3.2.0
jupyter==1.0.0
jupyter-client==6.1.6
jupyter-console==6.1.0
jupyter-core==4.6.3
kiwisolver==1.2.0
MarkupSafe==1.1.1
matplotlib==3.3.1
mistune==0.8.4
murmurhash==1.0.2
nbconvert==5.6.1
nbformat==5.0.7
networkx==2.5
nltk==3.5
notebook==6.1.3
numpy==1.19.1
packaging==20.4
pandas==1.1.1
pandocfilters==1.4.2
parso==0.7.1
pexpect==4.8.0


In [1]:
import pandas as pd
import numpy as np
from sklearn import *
import texthero
from sklearn_hierarchical_classification.classifier import HierarchicalClassifier
from sklearn_hierarchical_classification.constants import ROOT
from sklearn_hierarchical_classification.metrics import h_fbeta_score, multi_labeled

In [2]:
import inspect
print(inspect.getsourcefile(HierarchicalClassifier))

/Users/Viktor/miniconda3/envs/hier-clf/lib/python3.8/site-packages/sklearn_hierarchical_classification/classifier.py


In [3]:
path = 'data/train_40k.csv'

df_raw = pd.read_csv(path)


In [4]:
df = df_raw[['Title', 'Text', 'Cat1', 'Cat2']]
df.head()

Unnamed: 0,Title,Text,Cat1,Cat2
0,Golden Valley Natural Buffalo Jerky,The description and photo on this product need...,grocery gourmet food,meat poultry
1,Westing Game,This was a great book!!!! It is well thought t...,toys games,games
2,Westing Game,"I am a first year teacher, teaching 5th grade....",toys games,games
3,Westing Game,I got the book at my bookfair at school lookin...,toys games,games
4,I SPY A is For Jigsaw Puzzle 63pc,Hi! I'm Martine Redman and I created this puzz...,toys games,puzzles


In [5]:
df['clean_text'] = df.Text.pipe(texthero.clean)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['clean_text'] = df.Text.pipe(texthero.clean)


In [6]:
df.head()

Unnamed: 0,Title,Text,Cat1,Cat2,clean_text
0,Golden Valley Natural Buffalo Jerky,The description and photo on this product need...,grocery gourmet food,meat poultry,description photo product needs changed indica...
1,Westing Game,This was a great book!!!! It is well thought t...,toys games,games,great book well thought easily imagine events ...
2,Westing Game,"I am a first year teacher, teaching 5th grade....",toys games,games,first year teacher teaching 5th grade special ...
3,Westing Game,I got the book at my bookfair at school lookin...,toys games,games,got book bookfair school looking something sum...
4,I SPY A is For Jigsaw Puzzle 63pc,Hi! I'm Martine Redman and I created this puzz...,toys games,puzzles,hi martine redman created puzzle briarpatch us...


## Create the hierarchy

In [7]:
cat1 = list(df.Cat1.unique())
cat1

['grocery gourmet food',
 'toys games',
 'beauty',
 'health personal care',
 'baby products',
 'pet supplies']

In [8]:
cat2_counts = df.Cat2.value_counts()

In [9]:
hier = {}
hier[ROOT] = cat1
threshold = 750
for c in cat1:
    hier[c] = [c for c in df[df.Cat1 == c].Cat2.unique() if cat2_counts[c] > threshold]

included_cat2 = [j for i in hier.values() for j in i]

In [10]:
included_cat2

['grocery gourmet food',
 'toys games',
 'beauty',
 'health personal care',
 'baby products',
 'pet supplies',
 'beverages',
 'pantry staples',
 'games',
 'action toy figures',
 'dolls accessories',
 'baby toddler toys',
 'electronics for kids',
 'makeup',
 'skin care',
 'hair care',
 'fragrance',
 'personal care',
 'nutrition wellness',
 'household supplies',
 'health care',
 'medical supplies equipment',
 'feeding',
 'diapering',
 'cats',
 'dogs']

## Create the features
The `.fit()` method expects features as input. This is somehting that would need to change if we want to use this in production. 

In [11]:
df = df[df.Cat2.isin(included_cat2)]
df.head()

Unnamed: 0,Title,Text,Cat1,Cat2,clean_text
1,Westing Game,This was a great book!!!! It is well thought t...,toys games,games,great book well thought easily imagine events ...
2,Westing Game,"I am a first year teacher, teaching 5th grade....",toys games,games,first year teacher teaching 5th grade special ...
3,Westing Game,I got the book at my bookfair at school lookin...,toys games,games,got book bookfair school looking something sum...
5,ThinkFun Rush Hour,"My eight year old loves this game, whenever he...",toys games,games,eight year old loves game whenever coax away f...
6,Beetle Juice (1988),The real joy of this movie doesn't lie in its ...,grocery gourmet food,beverages,real joy movie lie fun special effects twisted...


In [12]:
df['cat1_cat2'] = df.apply(lambda x: [x['Cat1'], x['Cat2']], axis=1)

In [13]:
df.head()

Unnamed: 0,Title,Text,Cat1,Cat2,clean_text,cat1_cat2
1,Westing Game,This was a great book!!!! It is well thought t...,toys games,games,great book well thought easily imagine events ...,"[toys games, games]"
2,Westing Game,"I am a first year teacher, teaching 5th grade....",toys games,games,first year teacher teaching 5th grade special ...,"[toys games, games]"
3,Westing Game,I got the book at my bookfair at school lookin...,toys games,games,got book bookfair school looking something sum...,"[toys games, games]"
5,ThinkFun Rush Hour,"My eight year old loves this game, whenever he...",toys games,games,eight year old loves game whenever coax away f...,"[toys games, games]"
6,Beetle Juice (1988),The real joy of this movie doesn't lie in its ...,grocery gourmet food,beverages,real joy movie lie fun special effects twisted...,"[grocery gourmet food, beverages]"


In [14]:
train, test = model_selection.train_test_split(df, train_size=0.8, random_state=1)

In [15]:
tfidf_transform = feature_extraction.text.TfidfVectorizer(min_df=2, max_df=0.8, max_features=8192)
tfidf_transform.fit(train.clean_text)

TfidfVectorizer(max_df=0.8, max_features=8192, min_df=2)

In [16]:
train['features'] = train.clean_text.pipe(tfidf_transform.transform)
test['features'] = test.clean_text.pipe(tfidf_transform.transform)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['features'] = train.clean_text.pipe(tfidf_transform.transform)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['features'] = test.clean_text.pipe(tfidf_transform.transform)


## Create the hierarchical classifier

In [17]:
len(train)

22697

In [19]:
from tqdm import tqdm

base_estimator = pipeline.make_pipeline(
    svm.SVC(kernel='linear', probability=True)
)

clf = HierarchicalClassifier(
    base_estimator=base_estimator,
    class_hierarchy=hier,
    prediction_depth='nmlnp',
    algorithm='lcn',
    stopping_criteria=0.5,
    training_strategy='inclusive',
    progress_wrapper=tqdm
)

In [20]:
# Pandas stores the 
X = tfidf_transform.transform(train.clean_text)

In [21]:
import tqdm
clf.fit(X, train.cat1_cat2)

ValueError: You appear to be using a legacy multi-label data representation. Sequence of sequences are no longer supported; use a binary array or sparse matrix instead - the MultiLabelBinarizer transformer can convert to this format.

In [None]:
X_test = tfidf_transform.transform(test.clean_text)
Y_pred = clf.predict(X_test)

In [None]:
print(metrics.classification_report(test.Cat2, Y_pred))