In [84]:
from sklearn.datasets import fetch_20newsgroups

categories = ['rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt']

news = fetch_20newsgroups(subset='all', categories=categories, shuffle=True, random_state=228)
print("Categories: ", news.target_names)
print("Ddocuments number: ", len(news.data))

Categories:  ['rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt']
Ddocuments number:  3980


In [85]:
labels = news.target

**Exctracting features:**

In [86]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_df=0.6, min_df=0.05, 
                             stop_words='english', use_idf=True)

In [87]:
features = vectorizer.fit_transform(news.data)

print("Examples number: %d\nFeatures number: %d" % features.shape)

Examples number: 3980
Features number: 252


**Split the data:**

In [88]:
train_features, dev_test_features, train_labels, dev_test_labels = \
  ms.train_test_split(features, labels, train_size=0.8, random_state=228)
dev_features, test_features, dev_labels, test_labels = \
  ms.train_test_split(dev_test_features, dev_test_labels, train_size=0.5, random_state=228)

**Test:**

In [89]:
import sklearn.metrics

def test(model, dev_features, dev_labels):
    predicted_dev_labels = model.predict(dev_features)
    print(sklearn.metrics.classification_report(dev_labels, predicted_dev_labels))
    print()
    print(sklearn.metrics.accuracy_score(test_labels, predicted_dev_labels))

**Classification baseline:**

In [90]:
import sklearn.tree

tree_model = sklearn.tree.DecisionTreeClassifier()
tree_model.fit(train_features, train_labels)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [91]:
test(tree_model, dev_features, dev_labels)

             precision    recall  f1-score   support

          0       0.85      0.96      0.90        97
          1       0.78      0.73      0.75        99
          2       0.80      0.82      0.81       105
          3       0.94      0.88      0.91        97

avg / total       0.84      0.84      0.84       398


0.256281407035


**Using XGBoost model:** 

In [92]:
import xgboost

xgb_model = xgboost.XGBClassifier()
xgb_model.fit(train_features, train_labels)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [93]:
test(xgb_model, dev_features, dev_labels)

             precision    recall  f1-score   support

          0       0.96      0.91      0.93        97
          1       0.82      0.88      0.85        99
          2       0.86      0.91      0.89       105
          3       0.99      0.91      0.95        97

avg / total       0.91      0.90      0.90       398


0.248743718593


**Find appropriate hyperparameters for XGBoost model:**

In [94]:
import sklearn.model_selection as ms

def find_best_params(model, param_grid, train_features, train_labels):
    cv = ms.KFold(n_splits=3, shuffle=True, random_state=228)
    grid_search = ms.GridSearchCV(model, param_grid, cv=cv)
    grid_result = grid_search.fit(train_features, train_labels)
    
    print("Best: %0.4f using %s" % (grid_result.best_score_, grid_result.best_params_))

In [None]:
n_estimators = [75, 85, 100, 150, 200]
max_depth = [2, 3, 4, 6, 8]
learning_rate = [0.001, 0.01, 0.1]
param_grid = dict(max_depth=max_depth, n_estimators=n_estimators, learning_rate=learning_rate)

find_best_params(xgboost.XGBClassifier(), param_grid, train_features, train_labels)

**Test XGBoost model with found parameters:**

In [83]:
xgb_model_opt = xgboost.XGBClassifier(n_estimators=150, max_depth=8, learning_rate=0.1)
xgb_model_opt.fit(dev_features, dev_labels)
test(xgb_model_opt, test_features, test_labels)

             precision    recall  f1-score   support

          0       0.87      0.81      0.84        90
          1       0.63      0.77      0.69        96
          2       0.81      0.70      0.75        98
          3       0.96      0.94      0.95       114

avg / total       0.82      0.81      0.81       398


0.811557788945
