In [1]:
import sklearn
from sklearn import datasets
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

In [2]:
testdata=sklearn.datasets.fetch_20newsgroups(data_home=None, subset='test', categories=None, shuffle=True, random_state=42, remove=(), download_if_missing=True)
data=sklearn.datasets.fetch_20newsgroups(data_home=None, subset='train', categories=None, shuffle=True, random_state=42, remove=(), download_if_missing=True)

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import AdaBoostClassifier
from sklearn.datasets import make_classification
from time import time
import numpy as np
def load_data(data, input_features, target_feature, cat_features=None, num_features=None, txt_features=None):
    all_features = input_features + [target_feature]
    print(data.values.shape)
    data = data[all_features]
    data.dropna(subset=[target_feature], inplace=True)

    # change categorical features to numeric code
    if(cat_features!=None):
        data[cat_features] = data[cat_features].astype('category')
        data[cat_features] = data[cat_features].apply(lambda x: x.cat.codes)
    # replace nan with 0 in numerical features
    if(num_features!=None):
        data[num_features] = data[num_features].fillna(0.)
        for feature in num_features:
            data[feature] = data[feature].apply(lambda x: replace_string(x))
    if(txt_features!=None):
        if txt_features:
            data[txt_features] = data[txt_features].fillna('')

    return data
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})"
                  .format(results['mean_test_score'][candidate],
                          results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")
            
from sklearn.pipeline import Pipeline
def build_pipeline(model = DecisionTreeClassifier(random_state=0, criterion='gini')):
    return Pipeline(([
        ('vect',CountVectorizer()),
        ('tfidf',TfidfTransformer()),
        ('clf',model),
    ]))
def run_pipeline(text_ds,model=DecisionTreeClassifier(random_state=0, criterion='gini'),
                 gridsearch =False,
                 params ={'clf__splitter':['best','random'],
                          'tfidf__norm':['l1'],
                                    'clf__max_features':["auto","sqrt","log2",None],
                                    'clf__class_weight': ['balanced',None],
                                    'clf__min_samples_leaf':[1,5,100],
                                    'clf__min_samples_split':[2,100],
                                    'clf__max_depth':[10,25,50,100,None],
                                    'clf__max_leaf_nodes':[20,50,200,None]},
                ):
    pl = build_pipeline(model)
    pl.fit(text_ds.data,text_ds.target)
    
    if(gridsearch==True):
        search = GridSearchCV(pl, params, n_jobs=-1,verbose=1)
    else:
        search =  RandomizedSearchCV(pl, param_distributions=params,
                                   n_iter=10)
    start = time()
    search.fit(text_ds.data,text_ds.target)
    search.fit(data.data, data.target)
    print("SearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), 10))
    report(search.cv_results_)
    return(search)


In [28]:
build_pipeline(model=AdaBoostClassifier( random_state=0)).get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'vect', 'tfidf', 'clf', 'vect__analyzer', 'vect__binary', 'vect__decode_error', 'vect__dtype', 'vect__encoding', 'vect__input', 'vect__lowercase', 'vect__max_df', 'vect__max_features', 'vect__min_df', 'vect__ngram_range', 'vect__preprocessor', 'vect__stop_words', 'vect__strip_accents', 'vect__token_pattern', 'vect__tokenizer', 'vect__vocabulary', 'tfidf__norm', 'tfidf__smooth_idf', 'tfidf__sublinear_tf', 'tfidf__use_idf', 'clf__algorithm', 'clf__base_estimator', 'clf__learning_rate', 'clf__n_estimators', 'clf__random_state'])

In [None]:
bestdecitree=run_pipeline(data)

In [4]:
bestada = run_pipeline(data,AdaBoostClassifier(random_state=0),
                      params={
                             'clf__n_estimators': [10,50,100],
                             'clf__learning_rate':[.1,.3,.5,.7,.9,1],
                             'clf__algorithm':['SAMME.R','SAMME'],
                             })



SearchCV took 991.76 seconds for 10 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.543 (std: 0.018)
Parameters: {'clf__n_estimators': 50, 'clf__learning_rate': 0.9, 'clf__algorithm': 'SAMME.R'}

Model with rank: 2
Mean validation score: 0.542 (std: 0.018)
Parameters: {'clf__n_estimators': 100, 'clf__learning_rate': 0.9, 'clf__algorithm': 'SAMME.R'}

Model with rank: 3
Mean validation score: 0.538 (std: 0.010)
Parameters: {'clf__n_estimators': 50, 'clf__learning_rate': 0.5, 'clf__algorithm': 'SAMME.R'}



In [11]:
adapred = bestada.predict(testdata.data)
dtpred = bestdecitree(testdata.data)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
confusion_matrix(adapred,testdata.data)
confusion_matrix(dtpred,testdata.data)

In [7]:
#cuda
from sklearn.metrics import confusion_matrix, accuracy_score
def cross_validate(model,X_input,target,cv):
    return(cross_val_score(model, X_input, target, cv=cv))
cvada=cross_validate(bestada,testdata.data,testdata.target,3)
cvdecitree=cross_validate(bestdecitree,testdata.data,testdata.target,3)
pred = bestada.predict(testdata.data,testdata.target)

print(cvada)
print(cvdecitree)



array([0.44201747, 0.39721116, 0.4692492 ])