This exercise was taken from Paweł Jankiewicz and partially modified
https://github.com/logicai-io/pipelines-sklearn

In [1]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin, clone
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

In [2]:
# Read the raw table, then keep only the rows whose short description is longer than 10 characters.
df = pd.read_csv('../data/categories.csv')
df = df.loc[df.short_description.str.len() > 10]

In [3]:
df.head()

Unnamed: 0,website,categories,short_description
0,http://iFans.com,['News'],iFans is a community-based forum and portal th...
1,http://www.braingig.com,"['Non Profit', 'Finance']",Connecting grant funders and seekers.
2,https://www.twinelabs.com/,[],Twine is a powerful platform for internal mobi...
3,http://www.SumaGreen.com,['Biotechnology'],SumaGreen is an agro firm committed to enablin...
4,http://worldstartupreport.strikingly.com/,"['Market Research', 'CleanTech', 'Clean Energy']",World Startup Report is a social mission to do...


In [4]:
df.dtypes

website              object
categories           object
short_description    object
dtype: object

In [5]:
import ast
def cut_array(l):
    """Parse a stringified list literal and shorten it when it is long.

    Args:
        l: text holding a Python literal (e.g. "['News', 'Finance']"),
            evaluated with ``ast.literal_eval``.

    Returns:
        The parsed value unchanged when it has two items or fewer; otherwise a
        one-element slice holding only its first item.
    """
    parsed = ast.literal_eval(l)
    # NOTE(review): the threshold is 2 but only 1 item is kept - confirm this is intended.
    return parsed[0:1] if len(parsed) > 2 else parsed

# Parse every stringified category list (shortened by cut_array), then drop rows without categories.
df['categories_list'] = df['categories'].apply(cut_array)
df = df.loc[df['categories'] != '[]']

In [6]:
df['categories'].head()

0                                            ['News']
1                           ['Non Profit', 'Finance']
3                                   ['Biotechnology']
4    ['Market Research', 'CleanTech', 'Clean Energy']
5                          ['Accounting', 'Software']
Name: categories, dtype: object

In [7]:
df.groupby('categories').size().head()

categories
["Men's", "Women's", 'Beauty', 'Fashion']       1
["Men's", "Women's", 'Beauty']                  1
["Men's", "Women's", 'Children', 'Textiles']    1
["Men's", "Women's", 'Events']                  1
["Men's", "Women's", 'Fashion']                 7
dtype: int64

In [8]:
# Model inputs: description text as features, parsed category lists as targets.
X, y = df['short_description'].values, df['categories_list'].values

The task is to predict not one but multiple categories for each observation. 
One of the solutions is to create a binary classifier for each unique category.
It is fairly simple to do using scikit-learn but we need to create our own classifier.

Task:

1. Write a custom classifier to solve this.
2. Evaluate its results (what measure could be good for comparing sets?)


Hints:

- You can keep your classifiers in a dictionary mapping `class name -> classifier`.
- In both `fit` and `predict` you need to iterate over all unique classes.

In [9]:
def load_data():
    """Read the categories CSV and return ``(descriptions, category_lists)``.

    Only rows whose ``short_description`` is longer than 10 characters are
    kept. The ``categories`` text is parsed with ``cut_array`` into a
    ``categories_list`` column, and rows whose raw ``categories`` text is
    ``'[]'`` are removed.

    Returns:
        tuple: the ``short_description`` values and the ``categories_list``
        values of the remaining rows.
    """
    raw = pd.read_csv('../data/categories.csv')
    df = raw.loc[raw.short_description.str.len() > 10]

    # text representation -> Python list (cut_array also applies its shortening rule)
    df['categories_list'] = df['categories'].apply(cut_array)

    # keep only the rows that carry at least one category
    labelled = df.loc[df.categories != '[]']

    return labelled.short_description.values, labelled.categories_list.values

In [10]:
from itertools import chain

# make the classifier
class OneVsRestClassifier(BaseEstimator, ClassifierMixin):
    """Multi-label classifier that trains one binary estimator per category.

    Every distinct category found in the training labels gets its own clone of
    ``base_estimator``, fitted on a 0/1 target that says whether the category
    is present for each sample.

    Attributes:
        base_estimator: the estimator that is cloned once per category.
        estimators: dict mapping category -> fitted clone (filled by ``fit``).
        classes: the sorted unique categories seen by ``fit``.
    """

    def __init__(self, base_estimator):
        self.base_estimator = base_estimator
        self.estimators = {}

    def fit(self, X, y=None, sample_weight=None):
        """Fit one binary estimator per category.

        Args:
            X: training features, passed unchanged to every cloned estimator.
            y: iterable with one collection of categories per sample.
            sample_weight: accepted for signature compatibility; not used.

        Returns:
            self
        """
        # get all the unique categories
        self.classes = np.unique([item for sub in y for item in sub])
        # start from an empty mapping so a refit never keeps estimators of stale categories
        self.estimators = {}

        print("Fitting")

        for cl in self.classes:
            # clone the base estimator so every category starts from an unfitted copy
            est_ = clone(self.base_estimator)
            # binary target: 1 where the category is present for the sample, else 0
            est_.fit(X, self._isin(y, cl))
            self.estimators[cl] = est_
        return self

    def predict(self, X):
        """Predict the categories of every sample.

        Args:
            X: features; must expose ``shape[0]`` as the number of samples.

        Returns:
            list: one list of category names per sample, in the order of
            ``self.classes``; empty when no estimator predicts 1.
        """
        print("Predicting")
        # one empty category list per sample
        outputs = [[] for _ in range(X.shape[0])]

        for cl in self.classes:
            # indices of the samples for which this category is predicted
            true_indices = np.where(self.estimators[cl].predict(X) == 1)[0]
            for i in true_indices:
                outputs[i].append(cl)
        return outputs

    def _isin(self, ys, cl):
        """Return a 0/1 integer array telling whether ``cl`` is in each item of ``ys``."""
        # the builtin int replaces the np.int alias, which NumPy 1.24 removed
        return np.array([cl in y for y in ys], dtype=int)
    

In [11]:
# get the f1 score
def f1(true, pred):
    """Set-based F1 score between the true and predicted labels of one sample.

    Both arguments are treated as sets, so duplicate labels do not distort
    precision or recall.

    Args:
        true: iterable of the true labels.
        pred: iterable of the predicted labels.

    Returns:
        0 when either side is empty or when there is no overlap; otherwise the
        harmonic mean of precision and recall as a float.
    """
    true_set, pred_set = set(true), set(pred)
    # an empty side leaves precision or recall undefined: score 0 instead of dividing by zero
    if not true_set or not pred_set:
        return 0
    tp = len(true_set & pred_set)
    if tp == 0:
        return 0
    precision = tp / len(pred_set)
    recall = tp / len(true_set)
    return 2 * precision * recall / (precision + recall)
    


In [12]:
# add a CountVectorizer function and save in a pipeline
def model_definition_words(min_df) -> Pipeline:
    """Build the word-level classification pipeline.

    Args:
        min_df: forwarded to ``CountVectorizer(min_df=...)``.

    Returns:
        Pipeline: a binary, word-level ``CountVectorizer`` with English stop
        words, followed by ``OneVsRestClassifier`` wrapping a default
        ``AdaBoostClassifier``.
    """
    vectorizer = CountVectorizer(
        min_df=min_df, binary=True, analyzer='word', stop_words='english'
    )
    classifier = OneVsRestClassifier(base_estimator=AdaBoostClassifier())
    return make_pipeline(vectorizer, classifier)


In [13]:
# wrapper function
import warnings
warnings.filterwarnings('ignore')

def validate_model_multiple_outputs(min_df):
    """Train and score the multi-label pipeline for one ``min_df`` value.

    Loads the data, splits the first 10000 rows into train and test parts,
    fits the pipeline from ``model_definition_words`` and prints the mean
    per-sample ``f1`` on the test part.

    Args:
        min_df: forwarded to ``model_definition_words``.

    Returns:
        tuple: ``(X_tr, X_te, y_tr, y_te, preds)``.
    """
    print('Loading data')
    descriptions, labels = load_data()

    # only the first 10000 rows are used; random_state=1 pins the split
    X_tr, X_te, y_tr, y_te = train_test_split(
        descriptions[:10000], labels[:10000], random_state=1
    )

    model = model_definition_words(min_df)
    model.fit(X_tr, y_tr)
    preds = model.predict(X_te)

    # score each sample separately, then average
    per_sample = [f1(true, pred) for true, pred in zip(y_te, preds)]
    mean_f1 = np.array(per_sample).mean()
    print("Multiple Labels F1", mean_f1)
    return X_tr, X_te, y_tr, y_te, preds
    
#X_tr, X_te, y_tr, y_te, preds = validate_model_multiple_outputs()

In [17]:
# Evaluate the pipeline for min_df = 6, 7 and 8; the results of the last run stay in scope.
for i in range(6, 9):
    print('-----------------', f'min_df = {i}', sep='\n')
    X_tr, X_te, y_tr, y_te, preds = validate_model_multiple_outputs(i)

-----------------
min_df = 6
Loading data
Fitting
Predicting
Multiple Labels F1 0.08905333333333333
-----------------
min_df = 7
Loading data
Fitting
Predicting
Multiple Labels F1 0.08650666666666666
-----------------
min_df = 8
Loading data


KeyboardInterrupt: 

In [None]:
print(len(X_te))
print(len(y_te))
print(len(preds))

In [None]:
# Render every predicted category list as text such as "['A', 'B']" ("[]" when empty).
preds_ = ["[" + ", ".join("'" + cat + "'" for cat in row) + "]" for row in preds]


In [None]:
# Side-by-side view of each test description, its true categories and the predicted ones.
# The frame is built column by column: np.vstack would first have to turn the
# variable-length prediction lists into a rectangular array, which fails for ragged input.
overview = pd.DataFrame({
    'description': X_te,
    'truth': list(y_te),
    'predicted': list(preds),
})
overview

**Solution 1**
<div>
def load_data():
    """Read ``data/categories.csv`` and return ``(descriptions, category_lists)``.

    Keeps rows whose ``short_description`` is longer than 10 characters, parses
    ``categories`` with ``cut_array`` into ``categories_list``, removes rows
    whose raw ``categories`` text is ``'[]'`` and prints the head of the result.

    Returns:
        tuple: the ``short_description`` values and the ``categories_list`` values.
    """
    raw = pd.read_csv('data/categories.csv')
    df = raw.loc[raw.short_description.str.len() > 10]
    df['categories_list'] = df['categories'].apply(cut_array)
    labelled = df.loc[df.categories != '[]']
    print(labelled.head())
    return labelled.short_description.values, labelled.categories_list.values

from itertools import chain


class OneVsRestClassifier(BaseEstimator, ClassifierMixin):
    """Multi-label classifier that trains one binary estimator per category.

    Attributes:
        base_estimator: the estimator that is cloned once per category.
        estimators: dict mapping category -> fitted clone (filled by ``fit``).
        classes: the sorted unique categories seen by ``fit``.
    """

    def __init__(self, base_estimator):
        self.base_estimator = base_estimator
        self.estimators = {}

    def fit(self, X, y=None, sample_weight=None):
        """Fit one clone of ``base_estimator`` per category found in ``y``.

        ``y`` is an iterable with one collection of categories per sample;
        ``sample_weight`` is accepted but not used. Returns ``self``.
        """
        self.classes = np.unique([item for sub in y for item in sub])
        # start from an empty mapping so a refit never keeps estimators of stale categories
        self.estimators = {}
        print("Fitting")
        for cl in self.classes:
            est_ = clone(self.base_estimator)
            # binary target: 1 where the category is present for the sample, else 0
            est_.fit(X, self._isin(y, cl))
            self.estimators[cl] = est_
        return self

    def predict(self, X):
        """Return one list of predicted category names per row of ``X``."""
        print("Predicting")
        outputs = [[] for _ in range(X.shape[0])]
        for cl in self.classes:
            # indices of the samples for which this category is predicted
            true_indices = np.where(self.estimators[cl].predict(X) == 1)[0]
            for i in true_indices:
                outputs[i].append(cl)
        return outputs

    def _isin(self, ys, cl):
        """Return a 0/1 integer array telling whether ``cl`` is in each item of ``ys``."""
        # the builtin int replaces the np.int alias, which NumPy 1.24 removed
        return np.array([cl in y for y in ys], dtype=int)
    
    
def f1(true, pred):
    """Set-based F1 score between the true and predicted labels of one sample.

    Both arguments are treated as sets, so duplicate labels do not distort
    precision or recall. Returns 0 when either side is empty or when there is
    no overlap; otherwise the harmonic mean of precision and recall.
    """
    true_set, pred_set = set(true), set(pred)
    # an empty side leaves precision or recall undefined: score 0 instead of dividing by zero
    if not true_set or not pred_set:
        return 0
    tp = len(true_set & pred_set)
    if tp == 0:
        return 0
    precision = tp / len(pred_set)
    recall = tp / len(true_set)
    return 2 * precision * recall / (precision + recall)
    
    
def model_definition_words() -> Pipeline:
    """Build the pipeline: binary word counts (``min_df=5``) feeding a
    ``OneVsRestClassifier`` that wraps a default ``RandomForestClassifier``."""
    vectorizer = CountVectorizer(min_df=5, binary=True, analyzer='word')
    classifier = OneVsRestClassifier(base_estimator=RandomForestClassifier())
    return make_pipeline(vectorizer, classifier)


def validate_model_multiple_outputs():
    """Train the pipeline on the first 10000 rows and print the mean per-sample F1.

    The rows are split with ``train_test_split(random_state=1)``; the pipeline
    from ``model_definition_words`` is fitted on the training part and scored
    on the test part with ``f1``.
    """
    print('Loading data')
    X, y = load_data()

    X_tr, X_te, y_tr, y_te = train_test_split(X[:10000], y[:10000], random_state=1)
    est = model_definition_words()
    est.fit(X_tr, y_tr)
    # the predictions have to be computed here; without this line the scoring
    # below read an undefined (or stale global) `preds`
    preds = est.predict(X_te)
    mean_f1 = np.array([f1(true, pred) for true, pred in zip(y_te, preds)]).mean()
    print("Multiple Labels F1", mean_f1)
    
validate_model_multiple_outputs()
</div>

**Solution 2**
<div>

def load_data():
    """Read ``data/categories.csv`` and return ``(descriptions, category_lists)``.

    Keeps rows whose ``short_description`` is longer than 10 characters and
    prints the head of that frame. The ``categories`` column holds stringified
    lists, so it is parsed into real lists before being returned; returning
    the raw text would make the classifier iterate over single characters
    instead of category names. Rows whose ``categories`` text is ``'[]'`` are
    dropped because they carry no label to learn or score.

    Returns:
        tuple: the ``short_description`` values and the parsed category lists.
    """
    import ast  # local import so this snippet does not rely on an earlier cell

    df = pd.read_csv('data/categories.csv').loc[lambda x: x.short_description.str.len() > 10]
    print(df.head())
    labelled = df[df.categories != '[]']
    categories = labelled.categories.apply(ast.literal_eval)
    return labelled.short_description.values, categories.values


class OneVsRestClassifier(BaseEstimator, ClassifierMixin):
    """Multi-label classifier that trains one binary estimator per category.

    Attributes:
        base_estimator: the estimator that is cloned once per category.
        estimators: dict mapping category -> fitted clone (filled by ``fit``).
        classes: sorted list of the unique categories seen by ``fit``.
    """

    def __init__(self, base_estimator):
        self.base_estimator = base_estimator
        self.estimators = {}

    def fit(self, X, y=None, sample_weight=None):
        """Fit one clone of ``base_estimator`` per category found in ``y``.

        ``y`` is an iterable with one collection of categories per sample;
        ``sample_weight`` is accepted but not used. Returns ``self``.
        """
        # sorted() gives a stable class order; a bare set is ordered differently from run to run
        self.classes = sorted(set(chain.from_iterable(y)))
        # start from an empty mapping so a refit never keeps estimators of stale categories
        self.estimators = {}
        print("Fitting")
        for cl in self.classes:
            est_ = clone(self.base_estimator)
            # binary target: 1 where the category is present for the sample, else 0
            est_.fit(X, self._isin(y, cl))
            self.estimators[cl] = est_
        return self

    def predict(self, X):
        """Return one list of predicted category names per row of ``X``."""
        print("Predicting")
        outputs = [[] for _ in range(X.shape[0])]
        for cl in self.classes:
            # indices of the samples for which this category is predicted
            true_indices = np.where(self.estimators[cl].predict(X) == 1)[0]
            for i in true_indices:
                outputs[i].append(cl)
        return outputs

    def _isin(self, ys, cl):
        """Return a 0/1 integer array telling whether ``cl`` is in each item of ``ys``."""
        # the builtin int replaces the np.int alias, which NumPy 1.24 removed
        return np.array([cl in y for y in ys], dtype=int)
    
    
def f1(true, pred):
    """Set-based F1 score between the true and predicted labels of one sample.

    Both arguments are treated as sets, so duplicate labels do not distort
    precision or recall. Returns 0 when either side is empty or when there is
    no overlap; otherwise the harmonic mean of precision and recall.
    """
    true_set, pred_set = set(true), set(pred)
    # an empty side leaves precision or recall undefined: score 0 instead of dividing by zero
    if not true_set or not pred_set:
        return 0
    tp = len(true_set & pred_set)
    if tp == 0:
        return 0
    precision = tp / len(pred_set)
    recall = tp / len(true_set)
    return 2 * precision * recall / (precision + recall)
    
    
def model_definition_words() -> Pipeline:
    """Build the pipeline: binary word counts (``min_df=5``) feeding a
    ``OneVsRestClassifier`` that wraps a configured ``RandomForestClassifier``."""
    vectorizer = CountVectorizer(min_df=5, binary=True, analyzer='word')
    forest = RandomForestClassifier(
        n_estimators=100,
        min_samples_leaf=10,
        min_samples_split=20,
        n_jobs=-2,
    )
    return make_pipeline(vectorizer, OneVsRestClassifier(base_estimator=forest))


def validate_model_multiple_outputs():
    """Train the pipeline on the first 10000 rows and print the mean per-sample F1.

    The rows are split with ``train_test_split(random_state=1)``; the pipeline
    from ``model_definition_words`` is fitted on the training part and its
    test predictions are scored with ``f1``.
    """
    print('Loading data')
    descriptions, labels = load_data()
    X_tr, X_te, y_tr, y_te = train_test_split(
        descriptions[:10000], labels[:10000], random_state=1
    )
    model = model_definition_words()
    model.fit(X_tr, y_tr)
    preds = model.predict(X_te)
    # score each sample separately, then average
    per_sample = [f1(true, pred) for true, pred in zip(y_te, preds)]
    mean_f1 = np.array(per_sample).mean()
    print("Multiple Labels F1", mean_f1)
    
validate_model_multiple_outputs()
</div>