## Predicting Ratings from reviews

In [4]:
import nltk
#nltk.download()

In [3]:
import numpy as np
import pandas as pd
import os.path
import pickle

from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier

# load data, format specific to NB
with open('amazonbook.p', 'rb') as f:
    amazonbook = pickle.load(f)

train = {'data': amazonbook['train']['review'], 'target': amazonbook['train']['rating']}
test = {'data': amazonbook['test']['review'], 'target': amazonbook['test']['rating']}

print train['data'].shape
print train['target'].values-1

(182451L,)
[ 3.  4.  3. ...,  4.  3.  1.]


In [32]:
from sklearn.base import BaseEstimator, ClassifierMixin, clone
def zeroOne(y,a) :
    '''
    Computes the zero-one loss.
    @param y: output class
    @param a: predicted class
    @return 1 if different, 0 if same
    '''
    return int(y != a)

def featureMap(X,y,num_classes) :
    '''
    Computes the class-sensitive features.
    @param X: array-like, shape = [n_samples,n_inFeatures] or [n_inFeatures,], input features for input data
    @param y: a target class (in range 0,..,num_classes-1)
    @return array-like, shape = [n_samples,n_outFeatures], the class sensitive features for class y
    '''
    #The following line handles X being a 1d-array or a 2d-array
    num_samples, num_inFeatures = (1,X.shape[0]) if len(X.shape) == 1 else (X.shape[0],X.shape[1])
    #your code goes here, and replaces following return
    num_outFeatures = num_classes*num_inFeatures
    #feature_map = np.zeros(num_outFeatures)
    featurized_X = np.zeros(num_samples*num_outFeatures).reshape(num_samples,num_outFeatures)
    #Create a method for num_samples == 1
    if num_samples == 1:
        feature_map = np.zeros(num_outFeatures)
        feature_map[y*num_inFeatures:(y+1)*num_inFeatures]=X
        return feature_map
    # Compute feature for each sample
    for idx,sample in enumerate(X):
        yi = y[idx]
        feature_map = np.zeros(num_outFeatures)
        feature_map[yi*num_inFeatures:(yi+1)*num_inFeatures]=sample
        featurized_X[idx,:] = feature_map
    return featurized_X

def sgd(X, y, num_outFeatures, subgd, eta = 0.1, T = 1):
    '''
    Runs subgradient descent, and outputs resulting parameter vector.
    @param X: array-like, shape = [n_samples,n_features], input training data 
    @param y: array-like, shape = [n_samples,], class labels
    @param num_outFeatures: number of class-sensitive features
    @param subgd: function taking x,y and giving subgradient of objective
    @param eta: learning rate for SGD
    @param T: maximum number of iterations
    @return: vector of weights
    '''
    num_samples = X.shape[0]
    #your code goes here and replaces following return statement
    # initilize w
    decay = 1
    w = np.zeros(num_outFeatures)
    for t in range(T):
        #caluclate subgradient
        for idx,xi in enumerate(X):
            eta = eta/decay
            decay = decay + 1
            sg =subgd(xi,y[idx],w) 
            w = w - eta*sg
    return w

class MulticlassSVM(BaseEstimator, ClassifierMixin):
    '''
    Implements a Multiclass SVM estimator.
    '''
    def __init__(self, num_outFeatures, lam=1.0, num_classes=5, Delta=zeroOne, Psi=featureMap):       
        '''
        Creates a MulticlassSVM estimator.
        @param num_outFeatures: number of class-sensitive features produced by Psi
        @param lam: l2 regularization parameter
        @param num_classes: number of classes (assumed numbered 0,..,num_classes-1)
        @param Delta: class-sensitive loss function taking two arguments (i.e., target margin)
        @param Psi: class-sensitive feature map taking two arguments
        '''
        self.num_outFeatures = num_outFeatures
        self.lam = lam
        self.num_classes = num_classes
        self.Delta = Delta
        self.Psi = lambda X,y : Psi(X,y,num_classes)
        self.fitted = False
    
    def subgradient(self,x,y,w):
        '''
        Computes the subgradient at a given data point x,y
        @param x: sample input
        @param y: sample class
        @param w: parameter vector
        @return returns subgradient vector at given x,y,w
        '''
        #Your code goes here and replaces the following return statement
        # Compute (class-sensitive-loss + margin) for each class
        si = self.Psi
        loss  = [self.Delta(y,cls) + np.dot(w, (si(x,cls) - si(x,y))) for cls in range(self.num_classes)] 
        # get the class which maximizes loss(so that we have an upper bound on o/1 loss)
        y_opt = np.argmax(loss)
        # Using graddient expression derived in 3.3
        return 2*self.lam*w + si(x,y_opt)-si(x,y)

    def fit(self,X,y,eta=0.1,T=1):
        '''
        Fits multiclass SVM
        @param X: array-like, shape = [num_samples,num_inFeatures], input data
        @param y: array-like, shape = [num_samples,], input classes
        @param eta: learning rate for SGD
        @param T: maximum number of iterations
        @return returns self
        '''
        self.coef_ = sgd(X,y,self.num_outFeatures,self.subgradient,eta,T)
        self.fitted = True
        return self
    
    def decision_function(self, X):
        '''
        Returns the score on each input for each class. Assumes
        that fit has been called.
        @param X : array-like, shape = [n_samples, n_inFeatures]
        @return array-like, shape = [n_samples, n_classes] giving scores for each sample,class pairing
        '''
        if not self.fitted:
            raise RuntimeError("You must train classifer before predicting data.")
        print(X.shape)
        #Your code goes here and replaces following return statement
        # Initialize the score_matrix of appropriate dimensions
        score_matrix = np.zeros(len(X)*self.num_classes).reshape(len(X), self.num_classes)
        # Compute score for each sample and for each class
        for i,x_i in enumerate(X):
            score_matrix[i,:] = [np.dot(self.coef_, self.Psi(x_i,cls)) for cls in range(self.num_classes)]
        return score_matrix
            
    def predict(self, X):
        '''
        Predict the class with the highest score.
        @param X: array-like, shape = [n_samples, n_inFeatures], input data to predict
        @return array-like, shape = [n_samples,], class labels predicted for each data point
        '''

        #Your code goes here and replaces following return statement
        pred = np.zeros(X.shape[0])
        score_matrix = self.decision_function(X)
        for idx,row in enumerate(score_matrix):
            pred[idx] = np.argmax(row)
        return pred

In [33]:
# Feature Extraction
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_count = count_vect.fit_transform(train['data'])
print X_train_count.shape # 841,4900
# Working on Improving the features
# Better than counts are the frequencies of occurence of words
# Better than frequencies are tF-idf(term frequency times inverse Document Frequency)
# Count can be converted to tf-idf with standard sklearn package
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_count)
print(X_train_tfidf.shape)

X_test_count = count_vect.fit_transform(test['data'])
X_test_tfidf = tfidf_transformer.fit_transform(X_test_count)
print X_test_tfidf.shape






(81238, 40159)
(81238, 40159)
(27080, 24373)


In [34]:
est = MulticlassSVM(200795,lam=1)
#X_train = (X - np.mean(X,axis=0))/np.std(X,axis=0)
y=  train['target'].values-1
est.fit(X_train_tfidf,y)
print("w:")
print(est.coef_)
#Z = est.predict(X_test)
#print Z

ValueError: setting an array element with a sequence.

#### OneVsRest Classifier with LinearSVC

In [5]:
from nltk import word_tokenize       
from sklearn import svm
from nltk.stem import WordNetLemmatizer 
class Tokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]

# Tokenize the text
t = Tokenizer()

# Setting up the pipeline
text_clf = Pipeline([('vect', TfidfVectorizer(tokenizer=t, ngram_range=(1, 2), binary=True)),
                     ('clf', OneVsRestClassifier(svm.LinearSVC(loss='hinge', fit_intercept=False, C=0.1))),
])

# train the model
text_clf = text_clf.fit(train['data'], train['target'])

#predict
predicted = text_clf.predict(test['data'])
#print type(predicted)
#print test['target'].values
print np.mean(predicted == test['target'].values)
#print metrics.classification_report(test['target'], predicted)

0.538541221349


In [13]:
from sklearn.multiclass import OneVsOneClassifier
from sklearn.svm import LinearSVC

OneVsOneClassifier(LinearSVC(random_state=0))

text_clf2 = Pipeline([('vect', TfidfVectorizer(tokenizer=t, ngram_range=(1, 2), binary=True)),
                      ('clf', OneVsOneClassifier(svm.LinearSVC(random_state=0)))])

text_clf2 = text_clf2.fit(train['data'], train['target'])

predicted = text_clf2.predict(test['data'])
#print type(predicted)
#print test['target'].values
print np.mean(predicted == test['target'].values)


0.513449965471


In [12]:
from sklearn.linear_model import LogisticRegression
text_clf3 = Pipeline([('vect', TfidfVectorizer(tokenizer = t, ngram_range=(1,2), binary = True)),('clf', LogisticRegression(multi_class = 'ovr'))])

text_clf3 = text_clf3.fit(train['data'], train['target'])

predicted3 = text_clf3.predict(test['data'])

print np.mean(predicted3==test['target'].values)

0.541533756454
