In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.utils import shuffle
from sklearn.pipeline import Pipeline

import pickle
from nltk.corpus import movie_reviews

In [2]:
from sklearn.externals import joblib

In [3]:
# Negative reviews: collect the corpus file ids, then rebuild every review as a
# single space-separated string of its tokens.
negids = movie_reviews.fileids('neg')
negfeats = [" ".join(movie_reviews.words(fileids=[review_id])) for review_id in negids]

# Positive reviews: same treatment.
posids = movie_reviews.fileids('pos')
posfeats = [" ".join(movie_reviews.words(fileids=[review_id])) for review_id in posids]

# Negatives are placed first, so the labels are all 0s followed by all 1s.
texts = negfeats + posfeats
labels = [0] * len(negfeats) + [1] * len(posfeats)

##### Function that returns a trained classifier object

In [4]:
def my_classifier(text, label):
    """Build and fit an n-gram count -> TF-IDF -> linear SVM text pipeline.

    Parameters
    ----------
    text : iterable
        Training documents, passed unchanged to ``Pipeline.fit``.
    label : iterable
        Target label for each document, in the same order as ``text``.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The fitted pipeline, with steps named "vectorizer", "transformer"
        and "classifier".
    """
    # Count 1- to 3-grams; max_df=0.85 drops terms found in more than 85% of documents.
    ngram_counter = CountVectorizer(min_df=1, ngram_range=(1, 3), max_df=0.85, stop_words=None)
    tfidf_weighting = TfidfTransformer()
    # random_state is fixed so that repeated fits give the same model.
    linear_svm = LinearSVC(max_iter=400, loss='squared_hinge', C=1.1, tol=1e-05, random_state=1)

    pipeline = Pipeline([
        ("vectorizer", ngram_counter),
        ("transformer", tfidf_weighting),
        ("classifier", linear_svm),
    ])
    pipeline.fit(text, label)
    return pipeline
    

In [5]:
# Fit the pipeline on the movie-review texts and labels, then print the fitted
# pipeline to show its steps and hyperparameters.
cls = my_classifier(texts, labels)
print(cls)

Pipeline(steps=[('vectorizer', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=0.85, max_features=None, min_df=1,
        ngram_range=(1, 3), preprocessor=None, stop_words=None,
   ..._hinge', max_iter=400,
     multi_class='ovr', penalty='l2', random_state=1, tol=1e-05, verbose=0))])


##### Use the pickle library to save the trained classifier object to a file

In [6]:
# Serialize the trained classifier object to a file with pickle.
# No protocol argument is passed, so pickle's default protocol for the running
# interpreter is used.
with open('dumped_lin_SVC_classifier.pkl', 'wb') as fl:
    pickle.dump(cls, fl)

In [7]:
# Reload the classifier with the same library that wrote it: the file is a plain
# pickle produced by pickle.dump, so pickle.load is the matching reader.
# NOTE: only unpickle files you created yourself -- unpickling untrusted data can
# execute arbitrary code.
with open("dumped_lin_SVC_classifier.pkl", 'rb') as fl:
    model = pickle.load(fl)

In [8]:
# Print the reloaded pipeline to show its steps and hyperparameters.
print(model)

Pipeline(steps=[('vectorizer', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=0.85, max_features=None, min_df=1,
        ngram_range=(1, 3), preprocessor=None, stop_words=None,
   ..._hinge', max_iter=400,
     multi_class='ovr', penalty='l2', random_state=1, tol=1e-05, verbose=0))])


In [6]:
# Load the tab-separated test set; header=0 uses the first row of the file as
# the column names.
prod_sentim_test = pd.read_csv("./data/products_sentiment_test_copy.tsv", header = 0, sep="\t")

In [9]:
# Preview the first rows of the test set.
print(prod_sentim_test.head())
#prod_sentim_test["text"].iloc[4]

   Id                                               text
0   0  so , why the small digital elph , rather than ...
1   1  3/4 way through the first disk we played on it...
2   2  better for the zen micro is outlook compatibil...
3   3    6 . play gameboy color games on it with goboy .
4   4  likewise , i 've heard norton 2004 professiona...


In [10]:
# Predict a label for every test text with the reloaded pipeline
# (0 = negative, 1 = positive, matching the encoding used in `labels`).
result = model.predict(prod_sentim_test["text"])

In [11]:
# Print the array of predicted labels.
print(result)

[1 0 1 1 0 0 1 0 0 1 1 0 1 0 1 1 1 0 1 1 1 1 0 1 1 0 1 0 1 1 1 0 1 1 1 1 1
 1 1 1 0 1 1 1 1 0 1 1 1 0 1 0 1 1 0 1 1 0 1 1 1 1 1 0 0 0 0 0 1 1 1 0 1 0
 1 0 0 0 0 0 1 0 0 1 1 1 1 1 1 0 1 1 1 0 1 1 0 1 0 0 1 1 0 1 1 0 1 1 1 1 1
 1 0 1 1 1 1 0 1 0 0 1 0 1 1 0 1 1 1 1 1 0 1 1 1 1 0 0 0 1 1 0 1 0 1 1 0 0
 1 1 0 1 1 1 1 1 1 1 1 1 0 1 0 0 1 0 1 1 1 1 0 0 1 1 1 0 1 1 0 1 0 1 0 1 1
 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 1 1 0 1 0 1 1 1 0 1 1
 1 1 1 1 0 0 1 0 1 0 0 1 0 0 1 1 1 0 1 1 1 1 0 0 1 0 1 1 1 1 1 1 0 1 1 1 0
 1 1 0 0 1 0 0 0 1 1 0 1 1 1 1 1 1 0 1 1 1 0 1 1 0 0 0 1 0 1 1 1 1 0 1 1 0
 1 1 1 1 0 1 1 1 0 1 0 1 0 1 1 1 1 1 1 0 1 1 0 1 1 0 1 0 1 1 1 1 1 1 1 1 0
 1 1 1 1 1 1 1 1 0 0 0 1 1 1 1 1 0 1 0 1 0 1 1 1 1 0 1 1 0 1 1 0 1 1 1 1 0
 1 1 1 1 1 1 0 0 0 1 0 1 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1
 1 1 0 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 0 1 1 1 0 1 1 1 1
 0 1 0 1 1 1 0 0 0 1 0 0 1 1 1 0 1 1 1 1 0 1 1 0 1 1 1 1 1 1 0 1 1 1 0 0 1
 1 1 0 1 1 1 1 1 1 1 0 0 

In [22]:
def is_ascii(s):
    """Return True when every character of ``s`` has a code point below 128.

    An empty string yields True. Scanning stops at the first character that is
    outside the ASCII range.
    """
    for ch in s:
        if ord(ch) >= 128:
            return False
    return True

In [23]:
# Check the first ten test texts (fewer if the frame is shorter) for non-ASCII
# characters. Iterating over head(10) avoids the Python-2-only xrange and cannot
# index past the end of a short frame.
for sample_text in prod_sentim_test["text"].head(10):
    print(is_ascii(sample_text))

True
True
True
True
True
True
True
True
True
True
