In [1]:
from __future__ import division
from __future__ import print_function
import sklearn 
import numpy as np
import scipy.sparse as sp
import pandas as pd


In [2]:
from sklearn.model_selection import train_test_split

with open('rotten_imdb.tar/plot.5000', 'r') as f:
    plot_data = f.read().splitlines() 
    
len(plot_data)
plot_label = ['objective'] * len(plot_data)

with open('rotten_imdb.tar/quote.5000') as f:
    quote_data =f.read().splitlines() 
len(quote_data)
quote_label = ['subjective'] * len(quote_data)

data = plot_data + quote_data
label = np.array(plot_label + quote_label)
data = np.array([d.decode('utf-8', 'ignore') for d in data])

N = data.shape[0]

data_train, data_test, label_train, label_test = train_test_split(
    data, label, test_size=0.2, random_state=23)


#### Constructing features

In [8]:
# custom transformer
# Pos tagging count 
import nltk
from sklearn.base import TransformerMixin, BaseEstimator
from nltk import pos_tag
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import wordnet
from nltk.corpus import sentiwordnet as swn

class POSCountAndObjScoreTransformer(BaseEstimator, TransformerMixin):

    TAGS =  ['CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ'
             , 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNS'
             , 'NNP', 'NNPS', 'PDT', 'POS', 'PRP'
             , 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM'
             , 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP'
             , 'VBZ', 'WDT', 'WP', 'WP$', 'WRB']
    
    def __init__(self, normalize=True, pos_tags=None):
        if pos_tags:
            self.selected_tags = [t for t in pos_tags if t in self.TAGS]
        else:
            self.selected_tags = self.TAGS
        self.selected_tags_dict = {k: i for i, k in enumerate(self.selected_tags)}
        self.normalize = normalize

    def _to_wordnet_tag(self, nltk_tag):
        if nltk_tag.startswith('J'):
            return wordnet.ADJ
        elif nltk_tag.startswith('V'):
            return wordnet.VERB
        elif nltk_tag.startswith('N'):
            return wordnet.NOUN
        elif nltk_tag.startswith('R'):
            return wordnet.ADV
        else:
            return ''
        
    def transform(self, X):
        """
        Parameters
        ----------
            X: A list of string of sentences
        Returns
        ----------
            A 2-d array with ...
        """
        # Ignore punctuation
        tokenizer = RegexpTokenizer(r'\w+')
        # POS counts and objective score
        result = np.zeros((len(X), len(self.selected_tags) + 1))
        for i, sent in enumerate(X):
            words = tokenizer.tokenize(sent)
            tagged = nltk.pos_tag(words)
            for t in tagged:
                if t[1] in self.selected_tags_dict:
                    result[i, self.selected_tags_dict[t[1]]] += 1
                    wordnet_tag = self._to_wordnet_tag(t[1])
                    senti = swn.senti_synsets(t[0], wordnet_tag)
                    if senti:
                        result[i, -1] += senti[0].obj_score()

            if self.normalize:
                result[i] /= len(words)

        return sp.csr_matrix(result)
        
    def fit(self, x, y=None):
        return self


In [13]:
sentences = ['today is a good day', 'happy we go, ! !!']
s = POSCountAndObjScoreTransformer(pos_tags= ['JJ', 'JJR', 'JJS']).transform(data_train).todense()
s[:10]

matrix([[ 0.13793103,  0.        ,  0.        ,  0.03448276],
        [ 0.03333333,  0.        ,  0.        ,  0.01666667],
        [ 0.10344828,  0.        ,  0.        ,  0.06896552],
        [ 0.15789474,  0.        ,  0.        ,  0.05263158],
        [ 0.11111111,  0.03703704,  0.        ,  0.06944444],
        [ 0.28571429,  0.        ,  0.14285714,  0.21428571],
        [ 0.07142857,  0.        ,  0.        ,  0.01785714],
        [ 0.13888889,  0.        ,  0.        ,  0.13888889],
        [ 0.16666667,  0.        ,  0.        ,  0.0625    ],
        [ 0.33333333,  0.        ,  0.        ,  0.19444444]])

In [19]:
from nltk.corpus import stopwords as sw
from nltk.corpus import wordnet as wn
from nltk import wordpunct_tokenize
from nltk import WordNetLemmatizer
from nltk import sent_tokenize
from nltk import pos_tag
import string 

from sklearn.base import BaseEstimator, TransformerMixin

class NLTKPreprocessor(BaseEstimator, TransformerMixin):
    
    def __init__(self, stopwords=None, punct=None, lower=True, strip=True):
        self.lower = lower
        self.strip = strip
        self.stopwords = stopwords or set(sw.words('english'))
        self.punct = punct or set(string.punctuation)
        self.lemmatizer = WordNetLemmatizer()
        
    def fit(self, X, y=None):
        return self

    def inverse_transformation(self, X):
        return [" ".join(doc) for doc in X]
    
    def transform(self, X):
        self.counter = 0
        return [' '.join(list(self.tokenize(sent))) for sent in X]
    
    def tokenize(self, sent):
        self.counter += 1
        for token, tag in pos_tag(wordpunct_tokenize(sent)):
            # apply preprocessing to the token
            token = token.lower() if self.lower else token
            token = token.strip() if self.strip else token
            token = token.strip('_') if self.strip else token
            token = token.strip('*') if self.strip else token

            # if stopword, ignore token and continue
            if token in self.stopwords:
                continue

            # if puncutation, ignore token and continue
            if all(char in self.punct for char in token):
                continue

            # Lemmatize the token and yield
            lemma = self.lemmatize(token, tag)
            yield lemma
    
    def lemmatize(self, token, tag):
        tag = {
            'N': wn.NOUN,
            'V': wn.VERB,
            'R': wn.ADV,
            'J': wn.ADJ
        }.get(tag[0], wn.NOUN)
        
        return self.lemmatizer.lemmatize(token, tag)
                                                   

In [20]:
preprocessor = NLTKPreprocessor()
new_data_train = preprocessor.transform(data_train)

#### Building a pipeline

In [14]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

selected_tags = ['JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NNP', 'NNPS', 'RB', 'RBR', 'RBS', 'RP', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']

features =  ('features', FeatureUnion([
                ('tfidf', TfidfVectorizer(max_df=0.95, min_df=2, ngram_range=(1, 3), max_features=10000000))
                , ('pos', POSCountAndObjScoreTransformer(pos_tags=selected_tags))])
            )

bayes_pipeline = Pipeline([
    features
    , ('clf', MultinomialNB())
])

svm_pipeline = Pipeline([
    features
    , ('clf', SVC())
])

In [None]:
SGDClassifier(loss='hinge', penalty='12', shuffle=True)

OnveVsRestClassifier()

CalibratedClassifier()



#### Search for the best model

In [15]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [16]:
# Bayes grid search

selected_tags_param = [
    ['JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NNP', 'NNPS', 'RB', 'RBR', 'RBS', 'RP', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
    , ['JJ', 'JJR', 'JJS']
    , []]

params_grid = [
    {'features__pos__pos_tags': selected_tags_param}
]

bayes_gs = GridSearchCV(bayes_pipeline, params_grid)
bayes_gs.fit(data_train, label_train)


GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('features', FeatureUnion(n_jobs=1,
       transformer_list=[('tfidf', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=0.95, max_features=None, min_df=2,
        ngr...    transformer_weights=None)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'features__pos__pos_tags': [['JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NNP', 'NNPS', 'RB', 'RBR', 'RBS', 'RP', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'], ['JJ', 'JJR', 'JJS'], []]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [17]:
# SVM grid search

params_grid = [
    {'clf__C': [1], 'clf__kernel': ['linear']},
    {'clf__C': [1], 'clf__kernel': ['rbf'], 'clf__gamma': [0.001, 0.0001]}
]

svm_gs = GridSearchCV(svm_pipeline, params_grid)
svm_gs.fit(data_train, label_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('features', FeatureUnion(n_jobs=1,
       transformer_list=[('tfidf', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=0.95, max_features=None, min_df=2,
        ngr...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'clf__C': [1], 'clf__kernel': ['linear']}, {'clf__gamma': [0.001, 0.0001], 'clf__C': [1], 'clf__kernel': ['rbf']}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [18]:
from sklearn.metrics import classification_report

bayes_results = pd.DataFrame(bayes_gs.cv_results_)
bayes_results

svm_results = pd.DataFrame(svm_gs.cv_results_)
svm_results

# print("Detailed classification report:")
# print()
# print("The model is trained on the full development set.")
# print("The scores are computed on the full evaluation set.")
# print()
# y_true, y_pred = y_test, bayes_gs.predict(X_test)
# print(classification_report(y_true, y_pred))
# print


Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_clf__C,param_clf__gamma,param_clf__kernel,params,rank_test_score,split0_test_score,split0_train_score,split1_test_score,split1_train_score,split2_test_score,split2_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
0,21.700667,10.174667,0.910625,0.991625,1,,linear,"{u'clf__C': 1, u'clf__kernel': u'linear'}",1,0.913418,0.99156,0.911853,0.991189,0.906602,0.992126,1.195918,0.931347,0.002915315,0.0003854017
1,26.703,12.159,0.5015,0.5015,1,0.001,rbf,"{u'clf__gamma': 0.001, u'clf__C': 1, u'clf__ke...",2,0.501499,0.5015,0.5015,0.5015,0.5015,0.5015,0.99171,0.608156,5.302638e-07,2.651982e-07
2,26.323333,13.213333,0.5015,0.5015,1,0.0001,rbf,"{u'clf__gamma': 0.0001, u'clf__C': 1, u'clf__k...",2,0.501499,0.5015,0.5015,0.5015,0.5015,0.5015,0.936183,0.855864,5.302638e-07,2.651982e-07
