In [1]:
from multiprocessing import Pool
import pandas as pd
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer  
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
import xgboost as xgb
import nltk
import numpy as np

In [2]:
cfpb = pd.read_csv("/home/spenser/cfpb/complaints.csv")

In [3]:
cfpb = cfpb[~cfpb['Consumer complaint narrative'].isnull()]

In [4]:
companies = cfpb['Company'].value_counts().reset_index()
companies.columns = ['Company', 'count']

In [5]:
companies = companies[companies['count'] > 3]

In [6]:
companies.columns = ['Company', 'count']

In [7]:
cfpb = cfpb.merge(companies, on =['Company'], how='inner')

### Preprocess text

In [8]:
cfpb['lower_text'] = cfpb['Consumer complaint narrative'].str.lower()

In [9]:
# import these modules
ps = PorterStemmer()

In [None]:


def stem(text):
    stemmed = []
    for token in text.split(' '):
        stemmed_token = ps.stem(token)
        stemmed.append(stemmed_token)
    detokenized = TreebankWordDetokenizer().detokenize(stemmed)
    return detokenized

from multiprocessing import Pool

p = Pool(11)

cfpb['texts_stemmed'] = p.map(stem,  cfpb['lower_text'])

In [None]:
cfpb = cfpb.reset_index(drop=True)
X_train, X_test, y_train, y_test = train_test_split(cfpb['texts_stemmed'],  pd.Categorical(cfpb['Company']).codes, stratify = pd.Categorical(cfpb['Company']).codes, test_size=0.15, random_state=42)

#### Compare RandomForest and Xgboost processing times


Random Forest: 
svd: n_components = 100
max_depth: 10
n_estimators: 100

xgboost: 
svd: n_components = 250
max_depth: 10
n_estimators: 10 

In [28]:
components = 100

In [15]:
%%time
pipeline_rf = Pipeline([
                     ('tfidf', TfidfVectorizer()),
                     ('svd', TruncatedSVD(n_components= components, random_state=0)),
                     ('clf', RandomForestClassifier(max_depth=10, n_estimators=200,  verbose=1, n_jobs=11))])

pipeline_rf.fit(X_train, y_train)

[Parallel(n_jobs=11)]: Using backend ThreadingBackend with 11 concurrent workers.
[Parallel(n_jobs=11)]: Done  28 tasks      | elapsed:  8.5min
[Parallel(n_jobs=11)]: Done 178 tasks      | elapsed: 47.1min


CPU times: user 9h 25min 4s, sys: 41.1 s, total: 9h 25min 45s
Wall time: 54min 40s


[Parallel(n_jobs=11)]: Done 200 out of 200 | elapsed: 52.2min finished


Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('svd', TruncatedSVD(n_components=100, random_state=0)),
                ('clf',
                 RandomForestClassifier(max_depth=10, n_estimators=200,
                                        n_jobs=11, verbose=1))])

In [16]:
%%time
pipeline_xg = Pipeline([
                     ('tfidf', TfidfVectorizer()),
                     ('svd', TruncatedSVD(n_components= components, random_state=0)),
                     ('clf', xgb.XGBClassifier(max_depth = 10, n_estimators=10, n_jobs=11, objective="multi:softprob", random_state=42))])

pipeline_xg.fit(X_train, y_train)



CPU times: user 1d 3h 39min 22s, sys: 57.8 s, total: 1d 3h 40min 20s
Wall time: 2h 39min 6s


Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('svd', TruncatedSVD(n_components=100, random_state=0)),
                ('clf',
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, enable_categorical=False,
                               gamma=0, gpu_id=-1, importance_type=None,
                               interaction_constraints='',
                               learning_rate=0.300000012, max_delta_step=0,
                               max_depth=10, min_child_weight=1, missing=nan,
                               monotone_constraints='()', n_estimators=10,
                               n_jobs=11, num_parallel_tree=1,
                               objective='multi:softprob', predictor='auto',
                               random_state=42, reg_alpha=0, reg_lambda=1,
                               scale_pos_weight=None, subsample=1,


In [17]:
preds_rf = pipeline_rf.predict(X_test)
preds_xg = pipeline_xg.predict(X_test)

[Parallel(n_jobs=11)]: Using backend ThreadingBackend with 11 concurrent workers.
[Parallel(n_jobs=11)]: Done  28 tasks      | elapsed:   23.5s
[Parallel(n_jobs=11)]: Done 178 tasks      | elapsed:  1.9min
[Parallel(n_jobs=11)]: Done 200 out of 200 | elapsed:  2.1min finished


In [18]:
import numpy as np
preds_rf = pipeline_rf.predict(X_test)
np.unique(preds_rf)

[Parallel(n_jobs=11)]: Using backend ThreadingBackend with 11 concurrent workers.
[Parallel(n_jobs=11)]: Done  28 tasks      | elapsed:   29.6s
[Parallel(n_jobs=11)]: Done 178 tasks      | elapsed:  2.1min
[Parallel(n_jobs=11)]: Done 200 out of 200 | elapsed:  2.3min finished


array([ 329,  449,  495,  936, 1000, 1444, 1817, 1899, 1969, 2082, 2613,
       2851], dtype=int16)

In [19]:
import numpy as np
np.unique(preds_xg)

array([  61,  329,  495,  849,  936, 1000, 1444, 1899, 2039, 2296, 2416,
       2613, 2725], dtype=int16)

In [25]:
from sklearn.metrics import f1_score

f1_score(preds_rf, y_test, average='micro')

0.301699896131533

In [26]:
from sklearn.metrics import f1_score

f1_score(preds_xg, y_test, average='micro')

0.08165074861297596

#### XGboost - class weighting with balanced subsamples

In [29]:
%%time
pipeline_rf = Pipeline([
                     ('tfidf', TfidfVectorizer()),
                     ('svd', TruncatedSVD(n_components= components, random_state=0)),
                     ('clf', RandomForestClassifier(max_depth=10, 
                                                    n_estimators=200, 
                                                    class_weight = 'balanced_subsample', 
                                                    verbose=1, 
                                                    n_jobs=12))])

pipeline_rf.fit(X_train, y_train)

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.


KeyboardInterrupt: 