# IMDB Sentiment Classification

In [0]:
# Mount project directory
# Resolves `project_dir` (the project root) and `processed_path` (the folder
# the dataset JSON files are read from) for a Colab run or a local run.

from pathlib import Path

# `!pwd` is an IPython shell escape: the command output comes back as a list
# of lines, so the first element is the current working directory.
cur_dir = !pwd
cur_dir = cur_dir[0]

# For online use (Google Colab): mount Google Drive, which asks for an
# authorization code, then point at the project folder stored on the drive.

from google.colab import drive
drive.mount('/content/drive')
project_dir = Path(cur_dir).resolve() / \
  'drive/My Drive/School/McGill' / \
    'COMP 551 - Applied Machine Learning' / \
      'Project 2 - IMDB Sentiment Classification'


# For local use: the assignment below is disabled by being wrapped in a
# string literal; un-quote it (and skip the Colab section) to run locally.
"""
project_dir = Path(cur_dir).resolve()
"""
# Directory the dataset JSON files are loaded from.
processed_path = project_dir / 'data/processed'

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# Importing data

import json

def get_dataset(path):
    """
    Load one JSON file and return the decoded object.

    Input:
        path: location of the JSON file (str or path-like)
    Output:
        the Python object produced by json.load
    """
    # The context manager closes the handle even when decoding raises,
    # instead of leaving it to the garbage collector.
    with open(path) as dataset_file:
        return json.load(dataset_file)

def get_train_test_data(input_path, filenames):
    """
    Load every named dataset located under input_path.

    Input:
        input_path: directory; joined to each file name with the `/` operator
        filenames: iterable of file names
    Output:
        list of the loaded datasets, in the same order as filenames
    """
    datasets = []
    for name in filenames:
        datasets.append(get_dataset(input_path / name))
    return datasets

# File names, listed in the order their contents are unpacked below.
filenames = ('X_train.json', 'X_test.json', 'y_train.json')
X_train, X_test, y_train = get_train_test_data(processed_path, filenames)

## 1. Bernoulli Naive Bayes

### Implementation

In [0]:
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.preprocessing import binarize as bin

# Integer labels of the two classes handled by the classifier below.
NEGATIVE, POSITIVE = 0, 1

class BernoulliNaiveBayes(BaseEstimator, ClassifierMixin):
    """
    Bernoulli Naive Bayes classifier with 2 classes.

    Features are binarized, the per-class feature probabilities are
    estimated with additive (Laplacian) smoothing, and the fitted model is
    kept as a linear decision function:

        log-odds(x) = w_0 + x . w

    Parameters:
        binarize: threshold; values = 1 if values > threshold else 0
        k: smoothing constant added to every per-class feature count

    Attributes set by fit:
        w_0: intercept of the log-odds
        w: 1-D array holding one log-odds weight per feature
    """
    def __init__(self, binarize=.0, k=1):
        """
        Set:
            k value for Laplacian Smoothing
            binarize threshold: values = 1 if values > threshold else 0
        """
        self.binarize = binarize
        self.k = k

    def fit(self, X, y):
        """
        Input:
            X: n*m csr_matrix (sparse matrix); a dense n*m array also works
            y: list of length n holding NEGATIVE / POSITIVE labels
        Output:
            self, with w_0 and w set
        """
        k = self.k
        n = X.shape[0]
        X = bin(X, threshold=self.binarize)

        # Every label other than NEGATIVE is counted as POSITIVE.
        is_positive = np.asarray(y) != NEGATIVE
        num_y_1 = np.count_nonzero(is_positive)
        num_y_0 = n - num_y_1

        # Per-feature counts of examples with xj=1, split by class. One
        # matrix product replaces a Python loop over the n rows.
        count_x_1 = np.asarray(
            X.T.dot(is_positive.astype(np.float64))).ravel()
        count_x_0 = np.asarray(X.sum(axis=0)).ravel() - count_x_1

        # Define
        #     theta_j_1 = (# examples with xj=1 and y=1) / (# examples with y=1)
        #     theta_j_0 = (# examples with xj=1 and y=0) / (# examples with y=0)
        # Then
        #     theta_x_1[j] = theta_j_1
        #     theta_x_0[j] = theta_j_0
        # Smoothing adds k to the count of each of the 2 possible feature
        # values (xj=0 and xj=1), hence k in the numerator and 2*k in the
        # denominator.
        theta_x_0 = (count_x_0 + k) / (num_y_0 + 2 * k)
        theta_x_1 = (count_x_1 + k) / (num_y_1 + 2 * k)

        # Define
        #     w_j_0 = log ((1 - theta_j_1) / (1 - theta_j_0))
        #     w_j_1 = log (theta_j_1 / theta_j_0)
        # Then
        #     w_x_0[j] = w_j_0
        #     w_x_1[j] = w_j_1
        w_x_0 = np.log(1.0 - theta_x_1) - np.log(1.0 - theta_x_0)
        w_x_1 = np.log(theta_x_1) - np.log(theta_x_0)

        # Define
        #     w_0 = log (P(y=1) / P(y=0)) + sum of w_j_0 for all j
        #     w = w_x_1 - w_x_0
        #
        # Then, for a given datapoint x, the log-odds ratio is:
        #     w_0 + (x.transpose * w)
        #
        # P(y=1) / P(y=0) equals num_y_1 / num_y_0. The prior term is
        # infinite when the training labels contain a single class.
        w_0 = np.log(num_y_1) - np.log(num_y_0) + np.sum(w_x_0)
        w = w_x_1 - w_x_0

        self.w_0 = w_0
        self.w = w

        return self

    def predict(self, X):
        """
        Closed form solution for decision boundary.

        Input:
            X: n*m matrix with the same features as the training data
        Output:
            list of n labels: 1 where the log-odds are >= 0, else 0
        """
        X = bin(X, threshold=self.binarize)
        # Flatten so that weights stored as a 1*m matrix are accepted too.
        weights = np.asarray(self.w, dtype=float).ravel()
        log_odds = np.asarray(X.dot(weights)).ravel() + self.w_0

        y_pred = (log_odds >= 0).astype(int).tolist()

        return y_pred

### Performance

The classifier will be tested with TF\*IDF, bigrams, and different binarization thresholds.

In [0]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import Pipeline

# Word counts -> TF*IDF weighting -> row normalization -> Naive Bayes.
pclf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('norm', Normalizer()),
    ('clf', BernoulliNaiveBayes()),
])

pclf.fit(X_train, y_train)

In [0]:
from sklearn.model_selection import GridSearchCV

# Search space: n-gram ranges of the vectorizer and binarization
# thresholds of the classifier.
params = {
    'vect__ngram_range': [(1, 1), (1, 2), (2, 2)],
    'clf__binarize': [.0, .1, 0.15],
}

# Exhaustive search over the grid, scored with 3-fold cross validation.
grid_search = GridSearchCV(
    estimator=pclf,
    param_grid=params,
    cv=3,
    return_train_score=True,
    verbose=10,
)

grid_search.fit(X_train, y_train)

In [0]:
# Helper method to find the highest ranking models
# From: https://scikit-learn.org/stable/auto_examples/model_selection/plot_randomized_search.html
import numpy as np
def report(results, n_top=3):
    """
    Print the candidates that hold the best n_top ranks of a search.

    Input:
        results: mapping with 'rank_test_score', 'mean_test_score',
            'std_test_score' (arrays) and 'params' (sequence) entries
        n_top: number of ranks to print, starting at rank 1
    """
    for rank in range(1, n_top + 1):
        has_rank = results['rank_test_score'] == rank
        # Several candidates can share one rank; each of them is printed.
        for idx in np.flatnonzero(has_rank):
            print("Model with rank: {0}".format(rank))
            mean_score = results['mean_test_score'][idx]
            std_score = results['std_test_score'][idx]
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                mean_score, std_score))
            chosen = results['params'][idx]
            print("Parameters: {0}".format(chosen))
            print("")

In [0]:
# Top three ranks from the grid search (3-fold cross validation)
report(grid_search.cv_results_)

**Report:**  
Model with rank: 1  
Mean validation score: 0.871 (std: 0.007)  
Parameters: {'clf__binarize': 0.0, 'vect__ngram_range': (1, 2)}  

Model with rank: 2  
Mean validation score: 0.869 (std: 0.006)  
Parameters: {'clf__binarize': 0.0, 'vect__ngram_range': (2, 2)}  

Model with rank: 3  
Mean validation score: 0.851 (std: 0.004)  
Parameters: {'clf__binarize': 0.0, 'vect__ngram_range': (1, 1)}  

**Observations:**  
Bernoulli Naive Bayes achieves 85% accuracy with only TF\*IDF, and manages to reach 87% when bigrams are added.

## 2. Experimentation with two classifiers

### Feature construction

The features we'll use in our experiments are
1. TF\*IDF
2. L2 regularization
3. Sentiment lexicon, which will be our custom feature

#### Custom features: sentiment lexicon and Pooling

In [0]:
#@title Default title text
import nltk
from keras.layers.convolutional import Conv1D, MaxPooling1D
from sklearn.base import BaseEstimator, TransformerMixin

class SentimentLexicon(BaseEstimator, TransformerMixin):
    """
    Placeholder transformer for the custom sentiment-lexicon features.

    No features are extracted yet: transform returns None.
    """

    def fit(self, X, y=None):
        """
        Nothing is learned from the data.

        Output:
            self, so that fit(X).transform(X) can be chained (a fit that
            returns None breaks that chain).
        """
        return self

    def transform(self, X):
        """
        Input:
            X: the samples to transform (currently unused)
        Output:
            None -- TODO: build the sentiment-lexicon feature matrix
        """
        features = None

        return features
      

### Logistic Regression

In [0]:
from sklearn import metrics
from sklearn.linear_model import LogisticRegression as LR

# Word counts -> TF*IDF weighting -> row normalization -> logistic regression.
pclf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('norm', Normalizer()),
    ('clf', LR()),
])
pclf.fit(X_train, y_train)

# Score on the training set
y_train_pred = pclf.predict(X_train)
print(metrics.classification_report(y_train, y_train_pred))



              precision    recall  f1-score   support

           0       0.94      0.93      0.93     12500
           1       0.93      0.94      0.93     12500

   micro avg       0.93      0.93      0.93     25000
   macro avg       0.93      0.93      0.93     25000
weighted avg       0.93      0.93      0.93     25000



In [0]:
import pandas as pd
# Folder that receives the prediction files.
reports_path = project_dir.joinpath('reports')

# Label the test data with the fitted pipeline
y_test_pred = pclf.predict(X_test)

# Write the labels to CSV as a single 'Category' column
pd.DataFrame(
    y_test_pred,
    columns=['Category'],
).to_csv(reports_path.joinpath('prediction_1.csv'))

**Report:**  
Classifier: logistic regression  
Features: TF\*IDF and L2 regularization  
Kaggle score: 0.88186

#### Randomized search and cross validation

In [0]:
from sklearn.model_selection import RandomizedSearchCV

# Values sampled for the vectorizer, the TF*IDF step and the classifier.
params = {
    'vect__ngram_range': [(1, 1), (1, 2), (2, 2)],
    'tfidf__use_idf': [True, False],
    'clf__C': [0.001, 0.01, 0.1, 1, 10, 100],
}

# Fixed seed so the sampled candidates are the same on every run.
seed = 551

# 10 sampled candidates, each scored with 2-fold cross validation.
random_search = RandomizedSearchCV(
    estimator=pclf,
    param_distributions=params,
    n_iter=10,
    cv=2,
    random_state=seed,
    return_train_score=True,
    verbose=10,
)

random_search.fit(X_train, y_train)

Fitting 2 folds for each of 10 candidates, totalling 20 fits
[CV] vect__ngram_range=(1, 2), tfidf__use_idf=True, clf__C=0.01 ......


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  vect__ngram_range=(1, 2), tfidf__use_idf=True, clf__C=0.01, score=0.8016, total=  23.5s
[CV] vect__ngram_range=(1, 2), tfidf__use_idf=True, clf__C=0.01 ......


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   31.4s remaining:    0.0s


[CV]  vect__ngram_range=(1, 2), tfidf__use_idf=True, clf__C=0.01, score=0.79216, total=  23.6s
[CV] vect__ngram_range=(1, 2), tfidf__use_idf=True, clf__C=0.001 .....


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.0min remaining:    0.0s


[CV]  vect__ngram_range=(1, 2), tfidf__use_idf=True, clf__C=0.001, score=0.79856, total=  22.7s
[CV] vect__ngram_range=(1, 2), tfidf__use_idf=True, clf__C=0.001 .....


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  1.6min remaining:    0.0s


[CV]  vect__ngram_range=(1, 2), tfidf__use_idf=True, clf__C=0.001, score=0.7868, total=  22.9s
[CV] vect__ngram_range=(1, 1), tfidf__use_idf=False, clf__C=0.1 ......


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  2.1min remaining:    0.0s


[CV]  vect__ngram_range=(1, 1), tfidf__use_idf=False, clf__C=0.1, score=0.77288, total=   6.1s
[CV] vect__ngram_range=(1, 1), tfidf__use_idf=False, clf__C=0.1 ......


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.2min remaining:    0.0s


[CV]  vect__ngram_range=(1, 1), tfidf__use_idf=False, clf__C=0.1, score=0.7612, total=   5.7s
[CV] vect__ngram_range=(1, 2), tfidf__use_idf=False, clf__C=100 ......


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  2.4min remaining:    0.0s


[CV]  vect__ngram_range=(1, 2), tfidf__use_idf=False, clf__C=100, score=0.88744, total=  40.7s
[CV] vect__ngram_range=(1, 2), tfidf__use_idf=False, clf__C=100 ......


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:  3.1min remaining:    0.0s


[CV]  vect__ngram_range=(1, 2), tfidf__use_idf=False, clf__C=100, score=0.88216, total=  40.8s
[CV] vect__ngram_range=(1, 1), tfidf__use_idf=True, clf__C=100 .......


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  3.9min remaining:    0.0s


[CV]  vect__ngram_range=(1, 1), tfidf__use_idf=True, clf__C=100, score=0.88168, total=   7.8s
[CV] vect__ngram_range=(1, 1), tfidf__use_idf=True, clf__C=100 .......


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  4.1min remaining:    0.0s


[CV]  vect__ngram_range=(1, 1), tfidf__use_idf=True, clf__C=100, score=0.87576, total=   7.9s
[CV] vect__ngram_range=(1, 2), tfidf__use_idf=True, clf__C=0.1 .......
[CV]  vect__ngram_range=(1, 2), tfidf__use_idf=True, clf__C=0.1, score=0.83344, total=  23.2s
[CV] vect__ngram_range=(1, 2), tfidf__use_idf=True, clf__C=0.1 .......
[CV]  vect__ngram_range=(1, 2), tfidf__use_idf=True, clf__C=0.1, score=0.8216, total=  22.9s
[CV] vect__ngram_range=(2, 2), tfidf__use_idf=False, clf__C=100 ......
[CV]  vect__ngram_range=(2, 2), tfidf__use_idf=False, clf__C=100, score=0.8628, total=  26.0s
[CV] vect__ngram_range=(2, 2), tfidf__use_idf=False, clf__C=100 ......
[CV]  vect__ngram_range=(2, 2), tfidf__use_idf=False, clf__C=100, score=0.86464, total=  27.0s
[CV] vect__ngram_range=(1, 1), tfidf__use_idf=True, clf__C=0.01 ......
[CV]  vect__ngram_range=(1, 1), tfidf__use_idf=True, clf__C=0.01, score=0.78976, total=   5.8s
[CV] vect__ngram_range=(1, 1), tfidf__use_idf=True, clf__C=0.01 ......
[CV]  vec

[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:  7.8min finished


RandomizedSearchCV(cv=2, error_score='raise-deprecating',
          estimator=Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip...penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))]),
          fit_params=None, iid='warn', n_iter=10, n_jobs=None,
          param_distributions={'vect__ngram_range': [(1, 1), (1, 2), (2, 2)], 'tfidf__use_idf': [True, False], 'clf__C': [0.001, 0.01, 0.1, 1, 10, 100]},
          pre_dispatch='2*n_jobs', random_state=551, refit=True,
          return_train_score=True, scoring=None, verbose=10)

In [0]:
import pandas as pd
# Every sampled candidate as a table, best rank first.
pd.DataFrame(random_search.cv_results_).sort_values(by='rank_test_score')

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_clf__C,param_tfidf__use_idf,param_vect__ngram_range,params,rank_test_score,split0_test_score,split0_train_score,split1_test_score,split1_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
3,34.04426,6.71129,0.8848,1.0,100.0,False,"(1, 2)","{'vect__ngram_range': (1, 2), 'tfidf__use_idf'...",1,0.88744,1.0,0.88216,1.0,0.066578,0.015775,0.00264,0.0
4,5.119068,2.7096,0.87872,1.0,100.0,True,"(1, 1)","{'vect__ngram_range': (1, 1), 'tfidf__use_idf'...",2,0.88168,1.0,0.87576,1.0,0.005182,0.02656,0.00296,0.0
9,15.929556,5.259638,0.87816,1.0,100.0,True,"(2, 2)","{'vect__ngram_range': (2, 2), 'tfidf__use_idf'...",3,0.87976,1.0,0.87656,1.0,0.115909,0.083641,0.0016,0.0
6,21.726679,4.777279,0.86372,1.0,100.0,False,"(2, 2)","{'vect__ngram_range': (2, 2), 'tfidf__use_idf'...",4,0.8628,1.0,0.86464,1.0,0.388198,0.154218,0.00092,0.0
5,15.515996,7.499169,0.82752,0.87564,0.1,True,"(1, 2)","{'vect__ngram_range': (1, 2), 'tfidf__use_idf'...",5,0.83344,0.87608,0.8216,0.8752,0.109473,0.266102,0.00592,0.00044
0,15.636838,7.882475,0.79688,0.8314,0.01,True,"(1, 2)","{'vect__ngram_range': (1, 2), 'tfidf__use_idf'...",6,0.8016,0.83328,0.79216,0.82952,0.016575,0.061777,0.00472,0.00188
1,15.041045,7.712979,0.79268,0.82916,0.001,True,"(1, 2)","{'vect__ngram_range': (1, 2), 'tfidf__use_idf'...",7,0.79856,0.83008,0.7868,0.82824,0.014449,0.112936,0.00588,0.00092
7,3.125948,2.650925,0.7844,0.796,0.01,True,"(1, 1)","{'vect__ngram_range': (1, 1), 'tfidf__use_idf'...",8,0.78976,0.79616,0.77904,0.79584,0.020187,0.027153,0.00536,0.00016
2,3.390597,2.50559,0.76704,0.77788,0.1,False,"(1, 1)","{'vect__ngram_range': (1, 1), 'tfidf__use_idf'...",9,0.77288,0.77632,0.7612,0.77944,0.157826,0.022387,0.00584,0.00156
8,2.83748,2.491178,0.67256,0.67504,0.001,False,"(1, 1)","{'vect__ngram_range': (1, 1), 'tfidf__use_idf'...",10,0.67952,0.67208,0.6656,0.678,0.023104,0.024526,0.00696,0.00296


In [0]:
# Top three models from randomized search and 2-fold cross validation
# (report prints ranks 1 to 3 by default).

report(random_search.cv_results_)

Model with rank: 1
Mean validation score: 0.885 (std: 0.003)
Parameters: {'vect__ngram_range': (1, 2), 'tfidf__use_idf': False, 'clf__C': 100}

Model with rank: 2
Mean validation score: 0.879 (std: 0.003)
Parameters: {'vect__ngram_range': (1, 1), 'tfidf__use_idf': True, 'clf__C': 100}

Model with rank: 3
Mean validation score: 0.878 (std: 0.002)
Parameters: {'vect__ngram_range': (2, 2), 'tfidf__use_idf': True, 'clf__C': 100}



**Report:**  
Model with rank: 1  
Mean validation score: 0.885 (std: 0.003)  
Parameters: {'vect__ngram_range': (1, 2), 'tfidf__use_idf': False, 'clf__C': 100}

Model with rank: 2  
Mean validation score: 0.879 (std: 0.003)  
Parameters: {'vect__ngram_range': (1, 1), 'tfidf__use_idf': True, 'clf__C': 100}

Model with rank: 3  
Mean validation score: 0.878 (std: 0.002)  
Parameters: {'vect__ngram_range': (2, 2), 'tfidf__use_idf': True, 'clf__C': 100}

### Support Vector Machine

In [0]:
from sklearn.svm import SVC

# Word counts -> TF*IDF weighting -> row normalization -> linear-kernel SVM.
pclf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('norm', Normalizer()),
    ('clf', SVC(kernel='linear', cache_size=7000)),
])
pclf.fit(X_train, y_train)

# Score on the training set
y_train_pred = pclf.predict(X_train)
print(metrics.classification_report(y_train, y_train_pred))

              precision    recall  f1-score   support

           0       0.97      0.96      0.96     12500
           1       0.96      0.97      0.96     12500

   micro avg       0.96      0.96      0.96     25000
   macro avg       0.96      0.96      0.96     25000
weighted avg       0.96      0.96      0.96     25000



In [0]:
# Output of the training-set classification report above, kept as text

"""
              precision    recall  f1-score   support

           0       0.97      0.96      0.96     12500
           1       0.96      0.97      0.96     12500

   micro avg       0.96      0.96      0.96     25000
   macro avg       0.96      0.96      0.96     25000
weighted avg       0.96      0.96      0.96     25000
"""

In [0]:
import pickle
# Folder the fitted models are saved to.
models_path = project_dir / 'models'

# The context manager flushes and closes the file as soon as the dump is
# done, so the pickle is complete on disk before anything reads it back.
with open(models_path / 'SVM.pkl', 'wb') as model_file:
    pickle.dump(pclf, model_file)

NameError: ignored

#### Randomized search and cross validation

In [0]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

# Reload the saved SVM pipeline. The context manager closes the handle
# once loading is done.
# NOTE: pickle.load can execute arbitrary code; only load files that this
# notebook wrote itself.
with open(models_path / 'SVM.pkl', 'rb') as model_file:
    pclf = pickle.load(model_file)

# Values sampled for the vectorizer, the TF*IDF step and the classifier.
params = {
    "vect__ngram_range": [(1,1),(1,2),(2,2)],
    "tfidf__use_idf": [True, False],
    "clf__C": [0.001, 0.01, 0.1, 1, 10, 100],
}

# Fixed seed so the sampled candidates are the same on every run.
seed = 551

random_search = RandomizedSearchCV(
    pclf,
    param_distributions = params,
    cv=2, # 2-fold cross validation
    verbose = 5,
    random_state = seed,
    n_iter = 5, # 5 sampled candidates
    return_train_score=True,
)

random_search.fit(X_train, y_train)

Fitting 2 folds for each of 5 candidates, totalling 10 fits
[CV] vect__ngram_range=(1, 2), tfidf__use_idf=True, clf__C=0.01 ......


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  vect__ngram_range=(1, 2), tfidf__use_idf=True, clf__C=0.01, score=0.7628, total=13.4min
[CV] vect__ngram_range=(1, 2), tfidf__use_idf=True, clf__C=0.01 ......


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 20.2min remaining:    0.0s


[CV]  vect__ngram_range=(1, 2), tfidf__use_idf=True, clf__C=0.01, score=0.72704, total=13.4min
[CV] vect__ngram_range=(1, 2), tfidf__use_idf=True, clf__C=0.001 .....


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed: 40.3min remaining:    0.0s


[CV]  vect__ngram_range=(1, 2), tfidf__use_idf=True, clf__C=0.001, score=0.7628, total=13.5min
[CV] vect__ngram_range=(1, 2), tfidf__use_idf=True, clf__C=0.001 .....


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 60.6min remaining:    0.0s


[CV]  vect__ngram_range=(1, 2), tfidf__use_idf=True, clf__C=0.001, score=0.72704, total=13.4min
[CV] vect__ngram_range=(1, 1), tfidf__use_idf=False, clf__C=0.1 ......


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed: 80.8min remaining:    0.0s


[CV]  vect__ngram_range=(1, 1), tfidf__use_idf=False, clf__C=0.1, score=0.8004, total= 5.7min
[CV] vect__ngram_range=(1, 1), tfidf__use_idf=False, clf__C=0.1 ......
[CV]  vect__ngram_range=(1, 1), tfidf__use_idf=False, clf__C=0.1, score=0.78904, total= 5.6min
[CV] vect__ngram_range=(1, 2), tfidf__use_idf=False, clf__C=100 ......
[CV]  vect__ngram_range=(1, 2), tfidf__use_idf=False, clf__C=100, score=0.8864, total=16.6min
[CV] vect__ngram_range=(1, 2), tfidf__use_idf=False, clf__C=100 ......
[CV]  vect__ngram_range=(1, 2), tfidf__use_idf=False, clf__C=100, score=0.88312, total=16.0min
[CV] vect__ngram_range=(1, 1), tfidf__use_idf=True, clf__C=100 .......
[CV]  vect__ngram_range=(1, 1), tfidf__use_idf=True, clf__C=100, score=0.86728, total= 6.5min
[CV] vect__ngram_range=(1, 1), tfidf__use_idf=True, clf__C=100 .......
[CV]  vect__ngram_range=(1, 1), tfidf__use_idf=True, clf__C=100, score=0.86312, total= 6.5min


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed: 154.6min finished


RandomizedSearchCV(cv=2, error_score='raise-deprecating',
          estimator=Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...r', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False))]),
          fit_params=None, iid='warn', n_iter=5, n_jobs=None,
          param_distributions={'vect__ngram_range': [(1, 1), (1, 2), (2, 2)], 'tfidf__use_idf': [True, False], 'clf__C': [0.001, 0.01, 0.1, 1, 10, 100]},
          pre_dispatch='2*n_jobs', random_state=551, refit=True,
          return_train_score=True, scoring=None, verbose=5)

In [0]:
# Best-ranked candidates of the randomized search stored in random_search
report(random_search.cv_results_)

NameError: ignored

## Experimentation with feature extraction pipelines

Multiple features will be tested on logistic regression, all with 5-fold cross validation.

#### Only Bag of Words

In [0]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import GridSearchCV

# Just BoW

pclf = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', LR(C=1e42)), # huge C: effectively no regularization
])

# Nothing to tune: the empty grid yields a single candidate, so the search
# only provides the cross-validation score of the pipeline as defined.
params = {}

grid_search = GridSearchCV(
    pclf,
    param_grid=params,
    cv=5, # 5-fold cross validation
    verbose = 10,
    return_train_score=True,
)

# GridSearchCV clones and fits the pipeline itself, so the pipeline is not
# fitted separately beforehand (that would train once more for nothing).
grid_search.fit(X_train, y_train)

report(grid_search.cv_results_)



Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV]  ................................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ................................... , score=0.8774, total=  23.9s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   27.8s remaining:    0.0s


[CV] ................................... , score=0.8614, total=  21.5s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   53.2s remaining:    0.0s


[CV] .................................... , score=0.869, total=  22.2s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  1.3min remaining:    0.0s


[CV] .................................... , score=0.858, total=  21.6s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  1.7min remaining:    0.0s


[CV] ................................... , score=0.8714, total=  23.8s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.2min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.2min finished


Model with rank: 1
Mean validation score: 0.867 (std: 0.007)
Parameters: {}



**Report:**  
Model with rank: 1  
Mean validation score: 0.867 (std: 0.007)  
Parameters: Only BoW  

#### TF\*IDF

In [0]:
# TF*IDF: compare IDF weighting switched on and off

pclf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LR(C=1e42)),  # huge C: effectively no regularization
])

params = {'tfidf__use_idf': [True, False]}

# Both settings are scored with 5-fold cross validation.
grid_search = GridSearchCV(
    estimator=pclf,
    param_grid=params,
    cv=5,
    return_train_score=True,
    verbose=10,
)
grid_search.fit(X_train, y_train)

report(grid_search.cv_results_)

Fitting 5 folds for each of 2 candidates, totalling 10 fits
[CV] tfidf__use_idf=True .............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] .................. tfidf__use_idf=True, score=0.88, total=  12.4s
[CV] tfidf__use_idf=True .............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   16.4s remaining:    0.0s


[CV] ................ tfidf__use_idf=True, score=0.8668, total=  12.3s
[CV] tfidf__use_idf=True .............................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   32.7s remaining:    0.0s


[CV] ................. tfidf__use_idf=True, score=0.874, total=  12.2s
[CV] tfidf__use_idf=True .............................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   49.0s remaining:    0.0s


[CV] ................ tfidf__use_idf=True, score=0.8636, total=  11.7s
[CV] tfidf__use_idf=True .............................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  1.1min remaining:    0.0s


[CV] ................ tfidf__use_idf=True, score=0.8698, total=  12.0s
[CV] tfidf__use_idf=False ............................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.3min remaining:    0.0s


[CV] ............... tfidf__use_idf=False, score=0.8766, total=  20.7s
[CV] tfidf__use_idf=False ............................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  1.8min remaining:    0.0s


[CV] ............... tfidf__use_idf=False, score=0.8688, total=  18.2s
[CV] tfidf__use_idf=False ............................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:  2.1min remaining:    0.0s


[CV] ............... tfidf__use_idf=False, score=0.8732, total=  19.1s
[CV] tfidf__use_idf=False ............................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  2.5min remaining:    0.0s


[CV] ................ tfidf__use_idf=False, score=0.864, total=  17.5s
[CV] tfidf__use_idf=False ............................................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  2.9min remaining:    0.0s


[CV] ............... tfidf__use_idf=False, score=0.8706, total=  18.2s


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:  3.2min finished


Model with rank: 1
Mean validation score: 0.871 (std: 0.006)
Parameters: {'tfidf__use_idf': True}

Model with rank: 2
Mean validation score: 0.871 (std: 0.004)
Parameters: {'tfidf__use_idf': False}



**Report:**  
Model with rank: 1  
Mean validation score: 0.871 (std: 0.006)  
Parameters: {'tfidf__use_idf': True}  

Model with rank: 2  
Mean validation score: 0.871 (std: 0.004)  
Parameters: {'tfidf__use_idf': False}  

#### TF\*IDF and Bigrams

In [0]:
# TF*IDF on unigrams plus bigrams

pclf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LR(C=1e42)),  # huge C: effectively no regularization
])

# Single candidate: n-grams of length 1 to 2, IDF weighting on.
params = {
    'vect__ngram_range': [(1, 2)],
    'tfidf__use_idf': [True],
}

# Scored with 5-fold cross validation.
grid_search = GridSearchCV(
    estimator=pclf,
    param_grid=params,
    cv=5,
    return_train_score=True,
    verbose=10,
)
grid_search.fit(X_train, y_train)

report(grid_search.cv_results_)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] tfidf__use_idf=True, vect__ngram_range=(1, 2) ...................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  tfidf__use_idf=True, vect__ngram_range=(1, 2), score=0.915, total=  42.5s
[CV] tfidf__use_idf=True, vect__ngram_range=(1, 2) ...................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   54.3s remaining:    0.0s


[CV]  tfidf__use_idf=True, vect__ngram_range=(1, 2), score=0.9112, total=  42.3s
[CV] tfidf__use_idf=True, vect__ngram_range=(1, 2) ...................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.8min remaining:    0.0s


[CV]  tfidf__use_idf=True, vect__ngram_range=(1, 2), score=0.9104, total=  43.1s
[CV] tfidf__use_idf=True, vect__ngram_range=(1, 2) ...................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  2.8min remaining:    0.0s


[CV]  tfidf__use_idf=True, vect__ngram_range=(1, 2), score=0.9006, total=  48.9s
[CV] tfidf__use_idf=True, vect__ngram_range=(1, 2) ...................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  3.8min remaining:    0.0s


[CV]  tfidf__use_idf=True, vect__ngram_range=(1, 2), score=0.9076, total=  47.0s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  4.8min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  4.8min finished


Model with rank: 1
Mean validation score: 0.909 (std: 0.005)
Parameters: {'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}



In [0]:
import pandas as pd

reports_path = project_dir / 'reports'
# to_csv does not create missing directories; make sure the target folder
# exists so the export cannot fail after the (slow) model fit.
reports_path.mkdir(parents=True, exist_ok=True)

# Predict from test data. GridSearchCV is built without a `refit` argument
# above, so predict() uses the estimator refit on the full training set.
y_test_pred = grid_search.predict(X_test)

# Export to CSV file: a single 'Category' column; the DataFrame index is
# written as the first (unnamed) column.
pd.DataFrame(y_test_pred, columns=['Category']).to_csv(reports_path / 'prediction_5.csv')

**Report:**  
Model with rank: 1  
Mean validation score: 0.909 (std: 0.005)  
Parameters: {'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}  



#### TF\*IDF, Bigrams, L2 Regularization

In [0]:
# Bigrams, TF*IDF and L2 regularization.
# Same feature pipeline as the unregularized baseline, but the classifier
# keeps its L2 penalty and the strength C is tuned by the grid search.

pclf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LR(penalty='l2')),  # L2 regularization
])

# Three candidates, differing only in clf__C.
params = dict(
    vect__ngram_range=[(1, 2)],  # unigrams + bigrams
    tfidf__use_idf=[True],
    clf__C=[1, 10, 100],
)

grid_search = GridSearchCV(
    estimator=pclf,
    param_grid=params,
    cv=5,  # 5-fold cross validation
    verbose=10,
    return_train_score=True,
)
grid_search.fit(X_train, y_train)

# report() is defined outside this cell; it prints the ranked candidates.
report(grid_search.cv_results_)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] clf__C=1, tfidf__use_idf=True, vect__ngram_range=(1, 2) .........


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  clf__C=1, tfidf__use_idf=True, vect__ngram_range=(1, 2), score=0.8958, total=  32.1s
[CV] clf__C=1, tfidf__use_idf=True, vect__ngram_range=(1, 2) .........


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   45.3s remaining:    0.0s


[CV]  clf__C=1, tfidf__use_idf=True, vect__ngram_range=(1, 2), score=0.8868, total=  32.4s
[CV] clf__C=1, tfidf__use_idf=True, vect__ngram_range=(1, 2) .........


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.5min remaining:    0.0s


[CV]  clf__C=1, tfidf__use_idf=True, vect__ngram_range=(1, 2), score=0.8858, total=  32.2s
[CV] clf__C=1, tfidf__use_idf=True, vect__ngram_range=(1, 2) .........


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  2.3min remaining:    0.0s


[CV]  clf__C=1, tfidf__use_idf=True, vect__ngram_range=(1, 2), score=0.8822, total=  32.0s
[CV] clf__C=1, tfidf__use_idf=True, vect__ngram_range=(1, 2) .........


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  3.0min remaining:    0.0s


[CV]  clf__C=1, tfidf__use_idf=True, vect__ngram_range=(1, 2), score=0.8866, total=  32.2s
[CV] clf__C=10, tfidf__use_idf=True, vect__ngram_range=(1, 2) ........


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  3.8min remaining:    0.0s


[CV]  clf__C=10, tfidf__use_idf=True, vect__ngram_range=(1, 2), score=0.9088, total=  39.5s
[CV] clf__C=10, tfidf__use_idf=True, vect__ngram_range=(1, 2) ........


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  4.7min remaining:    0.0s


[CV]  clf__C=10, tfidf__use_idf=True, vect__ngram_range=(1, 2), score=0.906, total=  39.3s
[CV] clf__C=10, tfidf__use_idf=True, vect__ngram_range=(1, 2) ........


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:  5.5min remaining:    0.0s


[CV]  clf__C=10, tfidf__use_idf=True, vect__ngram_range=(1, 2), score=0.9002, total=  36.6s
[CV] clf__C=10, tfidf__use_idf=True, vect__ngram_range=(1, 2) ........


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  6.4min remaining:    0.0s


[CV]  clf__C=10, tfidf__use_idf=True, vect__ngram_range=(1, 2), score=0.8982, total=  38.4s
[CV] clf__C=10, tfidf__use_idf=True, vect__ngram_range=(1, 2) ........


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  7.2min remaining:    0.0s


[CV]  clf__C=10, tfidf__use_idf=True, vect__ngram_range=(1, 2), score=0.9032, total=  38.3s
[CV] clf__C=100, tfidf__use_idf=True, vect__ngram_range=(1, 2) .......
[CV]  clf__C=100, tfidf__use_idf=True, vect__ngram_range=(1, 2), score=0.91, total=  42.0s
[CV] clf__C=100, tfidf__use_idf=True, vect__ngram_range=(1, 2) .......
[CV]  clf__C=100, tfidf__use_idf=True, vect__ngram_range=(1, 2), score=0.9094, total=  40.6s
[CV] clf__C=100, tfidf__use_idf=True, vect__ngram_range=(1, 2) .......
[CV]  clf__C=100, tfidf__use_idf=True, vect__ngram_range=(1, 2), score=0.9036, total=  40.2s
[CV] clf__C=100, tfidf__use_idf=True, vect__ngram_range=(1, 2) .......
[CV]  clf__C=100, tfidf__use_idf=True, vect__ngram_range=(1, 2), score=0.9012, total=  37.5s
[CV] clf__C=100, tfidf__use_idf=True, vect__ngram_range=(1, 2) .......
[CV]  clf__C=100, tfidf__use_idf=True, vect__ngram_range=(1, 2), score=0.9078, total=  39.3s


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed: 12.4min finished


Model with rank: 1
Mean validation score: 0.906 (std: 0.003)
Parameters: {'clf__C': 100, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}

Model with rank: 2
Mean validation score: 0.903 (std: 0.004)
Parameters: {'clf__C': 10, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}

Model with rank: 3
Mean validation score: 0.887 (std: 0.004)
Parameters: {'clf__C': 1, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}



**Report:**  
Model with rank: 1  
Mean validation score: 0.906 (std: 0.003)  
Parameters: {'clf__C': 100, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}  

Model with rank: 2  
Mean validation score: 0.903 (std: 0.004)  
Parameters: {'clf__C': 10, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}  

Model with rank: 3  
Mean validation score: 0.887 (std: 0.004)  
Parameters: {'clf__C': 1, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}  

#### TF\*IDF, Bigrams, L1 Regularization

In [0]:
# Bigrams, TF*IDF and L1 regularization

pclf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    # L1 regularization. The solver is pinned to 'liblinear' explicitly:
    # it was the implicit default when this notebook was run (the logs warn
    # that the default changes to 'lbfgs' in sklearn 0.22), and 'lbfgs' does
    # not support the L1 penalty, so relying on the default breaks this cell
    # on newer sklearn versions.
    ('clf', LR(penalty='l1', solver='liblinear')),
])

# Three candidates, differing only in the inverse regularization strength C.
params = {
    "vect__ngram_range": [(1,2)], # bigrams
    "tfidf__use_idf": [True],
    "clf__C": [1,10,100]
}

grid_search = GridSearchCV(
    pclf,
    param_grid=params,
    cv=5, # 5-fold cross validation
    verbose = 10,
    return_train_score=True,
)

grid_search.fit(X_train, y_train)

# report() is defined outside this cell; it prints the ranked candidates.
report(grid_search.cv_results_)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] clf__C=1, tfidf__use_idf=True, vect__ngram_range=(1, 2) .........


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  clf__C=1, tfidf__use_idf=True, vect__ngram_range=(1, 2), score=0.8686, total=  30.4s
[CV] clf__C=1, tfidf__use_idf=True, vect__ngram_range=(1, 2) .........


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   42.3s remaining:    0.0s


[CV]  clf__C=1, tfidf__use_idf=True, vect__ngram_range=(1, 2), score=0.8634, total=  31.0s
[CV] clf__C=1, tfidf__use_idf=True, vect__ngram_range=(1, 2) .........


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.4min remaining:    0.0s


[CV]  clf__C=1, tfidf__use_idf=True, vect__ngram_range=(1, 2), score=0.8634, total=  30.7s
[CV] clf__C=1, tfidf__use_idf=True, vect__ngram_range=(1, 2) .........


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  2.1min remaining:    0.0s


[CV]  clf__C=1, tfidf__use_idf=True, vect__ngram_range=(1, 2), score=0.858, total=  30.8s
[CV] clf__C=1, tfidf__use_idf=True, vect__ngram_range=(1, 2) .........


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  2.9min remaining:    0.0s


[CV]  clf__C=1, tfidf__use_idf=True, vect__ngram_range=(1, 2), score=0.863, total=  30.6s
[CV] clf__C=10, tfidf__use_idf=True, vect__ngram_range=(1, 2) ........


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  3.6min remaining:    0.0s


[CV]  clf__C=10, tfidf__use_idf=True, vect__ngram_range=(1, 2), score=0.8926, total=  32.4s
[CV] clf__C=10, tfidf__use_idf=True, vect__ngram_range=(1, 2) ........


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  4.3min remaining:    0.0s


[CV]  clf__C=10, tfidf__use_idf=True, vect__ngram_range=(1, 2), score=0.8908, total=  33.6s
[CV] clf__C=10, tfidf__use_idf=True, vect__ngram_range=(1, 2) ........


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:  5.1min remaining:    0.0s


[CV]  clf__C=10, tfidf__use_idf=True, vect__ngram_range=(1, 2), score=0.8838, total=  33.2s
[CV] clf__C=10, tfidf__use_idf=True, vect__ngram_range=(1, 2) ........


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  5.8min remaining:    0.0s


[CV]  clf__C=10, tfidf__use_idf=True, vect__ngram_range=(1, 2), score=0.8828, total=  32.0s
[CV] clf__C=10, tfidf__use_idf=True, vect__ngram_range=(1, 2) ........


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  6.5min remaining:    0.0s


[CV]  clf__C=10, tfidf__use_idf=True, vect__ngram_range=(1, 2), score=0.8896, total=  32.9s
[CV] clf__C=100, tfidf__use_idf=True, vect__ngram_range=(1, 2) .......
[CV]  clf__C=100, tfidf__use_idf=True, vect__ngram_range=(1, 2), score=0.8926, total=  29.5s
[CV] clf__C=100, tfidf__use_idf=True, vect__ngram_range=(1, 2) .......
[CV]  clf__C=100, tfidf__use_idf=True, vect__ngram_range=(1, 2), score=0.887, total=  30.0s
[CV] clf__C=100, tfidf__use_idf=True, vect__ngram_range=(1, 2) .......
[CV]  clf__C=100, tfidf__use_idf=True, vect__ngram_range=(1, 2), score=0.8786, total=  31.0s
[CV] clf__C=100, tfidf__use_idf=True, vect__ngram_range=(1, 2) .......
[CV]  clf__C=100, tfidf__use_idf=True, vect__ngram_range=(1, 2), score=0.8816, total=  30.5s
[CV] clf__C=100, tfidf__use_idf=True, vect__ngram_range=(1, 2) .......
[CV]  clf__C=100, tfidf__use_idf=True, vect__ngram_range=(1, 2), score=0.8864, total=  31.3s


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed: 10.9min finished


Model with rank: 1
Mean validation score: 0.888 (std: 0.004)
Parameters: {'clf__C': 10, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}

Model with rank: 2
Mean validation score: 0.885 (std: 0.005)
Parameters: {'clf__C': 100, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}

Model with rank: 3
Mean validation score: 0.863 (std: 0.003)
Parameters: {'clf__C': 1, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}



**Report:**  
Model with rank: 1  
Mean validation score: 0.888 (std: 0.004)  
Parameters: {'clf__C': 10, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}

Model with rank: 2  
Mean validation score: 0.885 (std: 0.005)  
Parameters: {'clf__C': 100, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}

Model with rank: 3  
Mean validation score: 0.863 (std: 0.003)  
Parameters: {'clf__C': 1, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}

#### TF\*IDF, Bigrams, L1 Normalization + L2 Regularization

In [0]:
# Bigrams, TF*IDF, L1 sample normalization and L2 regularization

from sklearn.preprocessing import Normalizer

pclf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    # NOTE: Normalizer(norm='l1') rescales every sample (row) to unit L1 norm.
    # This is feature normalization, not an L1 penalty on the model weights.
    ('norm', Normalizer(norm='l1')),
    ('clf', LR(penalty='l2')),  # L2 regularization
])

# Three candidates, differing only in clf__C.
params = dict(
    vect__ngram_range=[(1, 2)],  # unigrams + bigrams
    tfidf__use_idf=[True],
    clf__C=[1, 10, 100],
)

grid_search = GridSearchCV(
    estimator=pclf,
    param_grid=params,
    cv=5,  # 5-fold cross validation
    verbose=10,
    return_train_score=True,
)
grid_search.fit(X_train, y_train)

# report() is defined outside this cell; it prints the ranked candidates.
report(grid_search.cv_results_)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] clf__C=1, tfidf__use_idf=True, vect__ngram_range=(1, 2) .........


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  clf__C=1, tfidf__use_idf=True, vect__ngram_range=(1, 2), score=0.8116, total=  25.3s
[CV] clf__C=1, tfidf__use_idf=True, vect__ngram_range=(1, 2) .........


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   37.5s remaining:    0.0s


[CV]  clf__C=1, tfidf__use_idf=True, vect__ngram_range=(1, 2), score=0.8048, total=  25.9s
[CV] clf__C=1, tfidf__use_idf=True, vect__ngram_range=(1, 2) .........


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.2min remaining:    0.0s


[CV]  clf__C=1, tfidf__use_idf=True, vect__ngram_range=(1, 2), score=0.8, total=  24.7s
[CV] clf__C=1, tfidf__use_idf=True, vect__ngram_range=(1, 2) .........


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  1.9min remaining:    0.0s


[CV]  clf__C=1, tfidf__use_idf=True, vect__ngram_range=(1, 2), score=0.7984, total=  24.9s
[CV] clf__C=1, tfidf__use_idf=True, vect__ngram_range=(1, 2) .........


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  2.5min remaining:    0.0s


[CV]  clf__C=1, tfidf__use_idf=True, vect__ngram_range=(1, 2), score=0.8008, total=  24.4s
[CV] clf__C=10, tfidf__use_idf=True, vect__ngram_range=(1, 2) ........


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  3.1min remaining:    0.0s


[CV]  clf__C=10, tfidf__use_idf=True, vect__ngram_range=(1, 2), score=0.8384, total=  25.2s
[CV] clf__C=10, tfidf__use_idf=True, vect__ngram_range=(1, 2) ........


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  3.7min remaining:    0.0s


[CV]  clf__C=10, tfidf__use_idf=True, vect__ngram_range=(1, 2), score=0.8282, total=  27.1s
[CV] clf__C=10, tfidf__use_idf=True, vect__ngram_range=(1, 2) ........


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:  4.3min remaining:    0.0s


[CV]  clf__C=10, tfidf__use_idf=True, vect__ngram_range=(1, 2), score=0.8232, total=  26.5s
[CV] clf__C=10, tfidf__use_idf=True, vect__ngram_range=(1, 2) ........


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  5.0min remaining:    0.0s


[CV]  clf__C=10, tfidf__use_idf=True, vect__ngram_range=(1, 2), score=0.8262, total=  26.5s
[CV] clf__C=10, tfidf__use_idf=True, vect__ngram_range=(1, 2) ........


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  5.6min remaining:    0.0s


[CV]  clf__C=10, tfidf__use_idf=True, vect__ngram_range=(1, 2), score=0.8268, total=  26.9s
[CV] clf__C=100, tfidf__use_idf=True, vect__ngram_range=(1, 2) .......
[CV]  clf__C=100, tfidf__use_idf=True, vect__ngram_range=(1, 2), score=0.8812, total=  30.2s
[CV] clf__C=100, tfidf__use_idf=True, vect__ngram_range=(1, 2) .......
[CV]  clf__C=100, tfidf__use_idf=True, vect__ngram_range=(1, 2), score=0.8694, total=  29.8s
[CV] clf__C=100, tfidf__use_idf=True, vect__ngram_range=(1, 2) .......
[CV]  clf__C=100, tfidf__use_idf=True, vect__ngram_range=(1, 2), score=0.87, total=  30.4s
[CV] clf__C=100, tfidf__use_idf=True, vect__ngram_range=(1, 2) .......
[CV]  clf__C=100, tfidf__use_idf=True, vect__ngram_range=(1, 2), score=0.864, total=  30.1s
[CV] clf__C=100, tfidf__use_idf=True, vect__ngram_range=(1, 2) .......
[CV]  clf__C=100, tfidf__use_idf=True, vect__ngram_range=(1, 2), score=0.8686, total=  29.7s


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:  9.8min finished


Model with rank: 1
Mean validation score: 0.871 (std: 0.006)
Parameters: {'clf__C': 100, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}

Model with rank: 2
Mean validation score: 0.829 (std: 0.005)
Parameters: {'clf__C': 10, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}

Model with rank: 3
Mean validation score: 0.803 (std: 0.005)
Parameters: {'clf__C': 1, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}



**Report:**  
Model with rank: 1  
Mean validation score: 0.871 (std: 0.006)  
Parameters: {'clf__C': 100, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}  

Model with rank: 2  
Mean validation score: 0.829 (std: 0.005)  
Parameters: {'clf__C': 10, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}  

Model with rank: 3  
Mean validation score: 0.803 (std: 0.005)  
Parameters: {'clf__C': 1, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}  


#### Observations

Logistic regression's performance increased in the following order of features:
1. Bag of words: 0.867
2. TF\*IDF + bigrams + L1 normalization + L2 regularization: 0.871
3. TF\*IDF alone: 0.871
4. TF\*IDF + bigrams + L1 regularization: 0.888
5. TF\*IDF + bigrams + L2 regularization: 0.906
6. TF\*IDF + bigrams without regularization: 0.909

## Final cross validation grid search

In [0]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Bigrams with TF*IDF, no regularization: the configuration is fixed in the
# pipeline itself and re-scored with 10 folds.

pclf = Pipeline(steps=[
    ('vect', CountVectorizer(ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer()),
    ('clf', LR(C=1e42)),  # huge C: penalty term is negligible
])

# An empty parameter grid yields exactly one candidate (the pipeline as
# configured), so the search only performs the cross validation.
grid_search = GridSearchCV(
    estimator=pclf,
    param_grid=dict(),
    cv=10,  # 10-fold cross validation
    verbose=10,
    return_train_score=True,
)
grid_search.fit(X_train, y_train)

# report() is defined outside this cell; it prints the ranked candidates.
report(grid_search.cv_results_)

Fitting 10 folds for each of 1 candidates, totalling 10 fits
[CV]  ................................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] .................................... , score=0.914, total=  47.2s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.0min remaining:    0.0s


[CV] ................................... , score=0.9176, total=  46.1s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  2.0min remaining:    0.0s


[CV] ................................... , score=0.9096, total=  48.6s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  3.0min remaining:    0.0s


[CV] ................................... , score=0.9108, total=  47.5s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  4.0min remaining:    0.0s


[CV] ................................... , score=0.9104, total=  51.6s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  5.2min remaining:    0.0s


[CV] ................................... , score=0.9056, total=  52.0s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  6.3min remaining:    0.0s


[CV] .................................... , score=0.898, total=  47.8s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:  7.3min remaining:    0.0s


[CV] .................................... , score=0.912, total=  48.7s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  8.4min remaining:    0.0s


[CV] .................................... , score=0.912, total=  55.9s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  9.6min remaining:    0.0s


[CV] ................................... , score=0.9148, total=  55.7s


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed: 10.7min finished


Model with rank: 1
Mean validation score: 0.910 (std: 0.005)
Parameters: {}



In [0]:
# Output
#
# Console log of the 10-fold cross-validation run, pasted into a string
# literal. It repeats the fold scores shown in the previous cell's output and
# additionally contains the sklearn FutureWarning about the default solver
# changing to 'lbfgs' in 0.22.

"""
Fitting 10 folds for each of 1 candidates, totalling 10 fits
[CV]  ................................................................
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
/usr/local/lib/python3.6/dist-packages/sklearn/linear_model/logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
[CV] .................................... , score=0.914, total=  47.2s
[CV]  ................................................................
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.0min remaining:    0.0s
[CV] ................................... , score=0.9176, total=  46.1s
[CV]  ................................................................
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  2.0min remaining:    0.0s
[CV] ................................... , score=0.9096, total=  48.6s
[CV]  ................................................................
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  3.0min remaining:    0.0s
[CV] ................................... , score=0.9108, total=  47.5s
[CV]  ................................................................
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  4.0min remaining:    0.0s
[CV] ................................... , score=0.9104, total=  51.6s
[CV]  ................................................................
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  5.2min remaining:    0.0s
[CV] ................................... , score=0.9056, total=  52.0s
[CV]  ................................................................
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  6.3min remaining:    0.0s
[CV] .................................... , score=0.898, total=  47.8s
[CV]  ................................................................
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:  7.3min remaining:    0.0s
[CV] .................................... , score=0.912, total=  48.7s
[CV]  ................................................................
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  8.4min remaining:    0.0s
[CV] .................................... , score=0.912, total=  55.9s
[CV]  ................................................................
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  9.6min remaining:    0.0s
[CV] ................................... , score=0.9148, total=  55.7s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed: 10.7min finished
Model with rank: 1
Mean validation score: 0.910 (std: 0.005)
Parameters: {}
"""

**Report:**  
Model with rank: 1  
Mean validation score: 0.910 (std: 0.005)  
Parameters: TF\*IDF, bigrams, no regularization

In [0]:
from sklearn.model_selection import GridSearchCV

# Exhaustive sweep over the pipeline currently bound to `pclf`:
# 5 n-gram ranges x 2 IDF settings x 6 values of C = 60 candidates,
# each scored with 5-fold cross validation (300 fits in total).
params = dict(
    vect__ngram_range=[(1, 1), (1, 2), (2, 2), (1, 3), (1, 4)],
    tfidf__use_idf=[True, False],
    clf__C=[0.001, 0.01, 0.1, 1, 10, 100],
)

grid_search = GridSearchCV(
    estimator=pclf,
    param_grid=params,
    cv=5,  # 5-fold cross validation
    verbose=10,
    return_train_score=True,
)
grid_search.fit(X_train, y_train)

In [0]:
from pandas import DataFrame

# Tabulate every candidate of the last grid search, best rank first,
# and print the table as LaTeX source.
df = DataFrame(grid_search.cv_results_)
df = df.sort_values(by=['rank_test_score'])
print(df.to_latex())

NameError: ignored

In [0]:
# Top three models from the exhaustive grid search (GridSearchCV) with 5-fold cross validation.
# NOTE(review): this comment previously said "randomized search", but the search fitted above is a GridSearchCV.

report(grid_search.cv_results_)

**Report:**  
Model with rank: 1  
Mean validation score: 0.906 (std: 0.003)  
Parameters: {'clf__C': 100, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}  

Model with rank: 2  
Mean validation score: 0.903 (std: 0.004)  
Parameters: {'clf__C': 10, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}  

Model with rank: 3  
Mean validation score: 0.902 (std: 0.004)  
Parameters: {'clf__C': 100, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 3)}  

Kaggle score (model rank 1): 0.90413

In [0]:
# Imported here so the cell does not depend on another cell having run first.
import pandas as pd

reports_path = project_dir / 'reports'
# to_csv does not create missing directories; make sure the target folder
# exists so the export cannot fail after the (slow) grid search.
reports_path.mkdir(parents=True, exist_ok=True)

# Predict from test data. GridSearchCV is built without a `refit` argument
# above, so predict() uses the best candidate refit on the full training set.
y_test_pred = grid_search.predict(X_test)

# Export to CSV file: a single 'Category' column; the DataFrame index is
# written as the first (unnamed) column.
pd.DataFrame(y_test_pred, columns=['Category']).to_csv(reports_path / 'prediction_4.csv')

In [0]:
from sklearn.svm import SVC

# Support vector classifier on TF*IDF features.
# Normalizer is imported in an earlier cell of this notebook.
pclf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('norm', Normalizer()),
    ('clf', SVC(cache_size=7000)),  # enlarged kernel cache
])

# Fit once on the full training set with the SVC's default settings.
pclf.fit(X_train, y_train)

In [0]:
from sklearn.model_selection import GridSearchCV

# SVC sweep over the n-gram range only: kernel and gamma are each pinned to a
# single value, giving 4 candidates in total.
params = dict(
    vect__ngram_range=[(1, 1), (1, 2), (1, 3), (1, 4)],
    # 'sigmoid' was tried in an earlier run and dropped after scoring far worse.
    clf__kernel=['linear'],
    # 'auto' was the alternative tried in an earlier run.
    clf__gamma=['scale'],
)

grid_search = GridSearchCV(
    estimator=pclf,
    param_grid=params,  # For grid search
    cv=2,  # 2-fold cross validation
    verbose=10,
    return_train_score=True,
)
grid_search.fit(X_train, y_train)

In [0]:
# Pasted console log from an earlier run of the SVC grid search, kept as a
# string literal. The log reports 16 candidates (gamma=auto, linear and sigmoid
# kernels), so it does not correspond to the 4-candidate grid defined in the
# cell above, and it ends after 9 of the 32 fits.
"""
Fitting 2 folds for each of 16 candidates, totalling 32 fits
[CV] clf__gamma=auto, clf__kernel=linear, vect__ngram_range=(1, 1) ...
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[CV]  clf__gamma=auto, clf__kernel=linear, vect__ngram_range=(1, 1), score=0.8852, total= 3.5min
[CV] clf__gamma=auto, clf__kernel=linear, vect__ngram_range=(1, 1) ...
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  5.4min remaining:    0.0s
[CV]  clf__gamma=auto, clf__kernel=linear, vect__ngram_range=(1, 1), score=0.88056, total= 3.5min
[CV] clf__gamma=auto, clf__kernel=linear, vect__ngram_range=(1, 2) ...
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed: 10.6min remaining:    0.0s
[CV]  clf__gamma=auto, clf__kernel=linear, vect__ngram_range=(1, 2), score=0.8988, total= 9.8min
[CV] clf__gamma=auto, clf__kernel=linear, vect__ngram_range=(1, 2) ...
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 25.0min remaining:    0.0s
[CV]  clf__gamma=auto, clf__kernel=linear, vect__ngram_range=(1, 2), score=0.89264, total= 9.5min
[CV] clf__gamma=auto, clf__kernel=linear, vect__ngram_range=(1, 3) ...
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed: 38.8min remaining:    0.0s
[CV]  clf__gamma=auto, clf__kernel=linear, vect__ngram_range=(1, 3), score=0.89536, total=13.2min
[CV] clf__gamma=auto, clf__kernel=linear, vect__ngram_range=(1, 3) ...
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 58.2min remaining:    0.0s
[CV]  clf__gamma=auto, clf__kernel=linear, vect__ngram_range=(1, 3), score=0.89048, total=13.0min
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed: 77.5min remaining:    0.0s
[CV] clf__gamma=auto, clf__kernel=linear, vect__ngram_range=(1, 4) ...
[CV]  clf__gamma=auto, clf__kernel=linear, vect__ngram_range=(1, 4), score=0.89264, total=16.7min
[CV] clf__gamma=auto, clf__kernel=linear, vect__ngram_range=(1, 4) ...
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed: 102.7min remaining:    0.0s
[CV]  clf__gamma=auto, clf__kernel=linear, vect__ngram_range=(1, 4), score=0.88648, total=16.5min
[CV] clf__gamma=auto, clf__kernel=sigmoid, vect__ngram_range=(1, 1) ..
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed: 127.6min remaining:    0.0s
[CV]  clf__gamma=auto, clf__kernel=sigmoid, vect__ngram_range=(1, 1), score=0.64816, total= 6.8min
[CV] clf__gamma=auto, clf__kernel=sigmoid, vect__ngram_range=(1, 1) ..
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed: 137.7min remaining:    0.0s
"""

**Observations:**  
The score dropped drastically when testing with a sigmoid kernel, so from here on we test only with a linear kernel.

In [0]:
# Print the ranked candidates of the SVC grid search; report() is a helper
# defined outside this cell.
report(grid_search.cv_results_)