# Train Classifier with TFIDF

In [67]:
import csv
import jieba
import re
import random
import numpy as np

from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
# from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC, SVC

## type dict
Grammar = {'完成式': 1, '進行式': 2, '過去式': 3, '未來式': 4, '關係代名詞': 5, '不定詞': 6, '名詞子句': 7, 
           '被動': 8, '介係詞': 9, '連接詞': 10, '假設語氣': 11, '分詞': 12, 'PT': 13, '其它': 0}

In [68]:
Grammar = {'1': '完成式', '2': '進行式', '3': '過去式', '4': '未來式', '5': '關係代名詞', '6': '不定詞', '7': '名詞子句', '8': '被動', '9': '介係詞', \
           '10': '連接詞', '11': '假設語氣', '12': '分詞', '13': 'PT', '0': '其它'}

In [69]:
with open('questions_nondup_dup.csv') as csvfile:
    data_dict = defaultdict()
    for row in csv.DictReader(csvfile):
        data_dict[row['question_id']] = row

## splitting data

In [122]:
from collections import defaultdict

class DataHelper(object):
    def __init__(self, file):
        self.file = file
        self.stopwords = ['什麼', '請問', '這裡', '不是', '意思', '這邊', '謝謝', '這句', '為何', '使用', '怎麼', '要加', '老師', '還是', '如何', '甚麼', '一下', '這個', '這樣', '問為', '因為', '何要', '用過', '是不是', '一個', '應該', '直接', '好像', '如果', '何不', '兩個', '這是', '何用', '需要', '時候', '所以', '您好', '起來', '還有', '加上', '寫成', '你好', '此句', '有點', '問此', '不好意思', '不到', '像是', '這裏', '為什麼']
        
        with open('{0}'.format(self.file)) as data_file:
            self.unamb_data = defaultdict(list)
            self.amb_data = defaultdict(list)
            for row in csv.DictReader(data_file):
                if row['ambiguous'] == '0':
                    # can't directly use row.values() as it doesn't grantee the order
                    self.unamb_data[row['type']].append([row['question_id'], row['member_id'], \
                                                         row['type'], row['question'], row['ambiguous']])
                else:
                    self.amb_data[row['type']].append([row['question_id'], row['member_id'], \
                                                         row['type'], row['question'], row['ambiguous']])
                    
    def get_all_unambiguous_data(self):
        X = []
        y = []
        member_id = []
        question_id = []
        for key, record in self.unamb_data.items():
            if key == '13':
                continue
            questions = list(list(zip(*record))[3]) 
            members = list(list(zip(*record))[1]) # get memberid list from records
            question_idx = list(list(zip(*record))[0])
            X += questions
            y += [key]*len(questions)
            member_id += members
            question_id += question_idx
            
        X_text = self.cut_questions(X)
        return X_text, np.array(y), member_id, question_id
        
    def get_shuffled_data(self, ratio = 8):
        X_train = []
        X_test = []
        Y_train = []
        Y_test = []
        member_train = []
        member_test = []
        question_train = []
        question_test = []
        for key, record in self.unamb_data.items():
            if key == '13':
                continue
            questions = list(list(zip(*record))[3]) # get question list from records
            members = list(list(zip(*record))[1]) # get memberid list from records
            question_idx = list(list(zip(*record))[0])
            random.shuffle(questions)
            split_point = len(questions)*ratio//10
            train = questions[:split_point]
            test = questions[split_point:]
            member_train += members[:split_point]
            member_test += members[split_point:]
            question_train += question_idx[:split_point]
            question_test += question_idx[split_point:]
            X_train += train
            X_test += test
            Y_train += [key]*len(train) # repeat len(train) times
            Y_test += [key]*len(test)
            
        X_train_text = self.cut_questions(X_train)
        X_test_text = self.cut_questions(X_test)
        return X_train_text, np.array(Y_train), X_test_text, np.array(Y_test), member_train, member_test, question_train, question_test
    
    # use non-duplications as training and duplications as testing
    # the file should be questions_nondup_dup.csv
    def get_fixed_data(self):
        X_train = []
        X_test = []
        Y_train = []
        Y_test = []
        member_train = []
        member_test = []
        question_train = []
        question_test = []
        
        for key, record in self.unamb_data.items():
            if key == '13':
                continue
            questions = list(list(zip(*record))[3]) # get question list from records
            members = list(list(zip(*record))[1]) # get memberid list from records
            question_idx = list(list(zip(*record))[0])
            X_train += questions
            Y_train += [key]*len(questions)
            member_train += members
            question_train += question_idx
        for key, record in self.amb_data.items():
            if key == '13':
                continue
            questions = list(list(zip(*record))[3]) # get question list from records
            members = list(list(zip(*record))[1]) # get memberid list from records
            question_idx = list(list(zip(*record))[0])
            X_test += questions
            Y_test += [key]*len(questions)
            member_test += members
            question_test += question_idx
            
        X_train_text = self.cut_questions(X_train)
        X_test_text = self.cut_questions(X_test)
        return X_train_text, np.array(Y_train), X_test_text, np.array(Y_test), member_train, member_test, question_train, question_test
        
    def cut_questions(self, data):
        corpus = []
        for q in data:
            segs = jieba.cut(q, cut_all=False)
            final = [seg for seg in segs if seg not in self.stopwords]
            corpus.append(' '.join(final))
        return corpus

In [123]:
dh = DataHelper('questions_nondup_dup2.csv')
print(dh.unamb_data['2'][:3])
print(dh.amb_data['2'][:3])

[['30383', '56291', '2', '這裡的 letting 加ing是因為also的關係嗎?  You wanna be getting to know a person 是未來進行式吧?並不是因為 also 的關係喔，是因為接續了前面的 wanna be，完整一點應該是： ...and you also wanna be letting that person get to know you.  因為和前面共用一個 wanna be，所以後面省略。  這裡因為沒有 will，所以不算是未來進行式，但現在進行式本身也有未來的意涵喔。', '0'], ['28854', '56291', '2', "you'll be speaking 為甚麼speak要加ing 還有這句要如何解釋這裡是「未來進行式」的用法，表示某一動作將會、或可能在未來某一時刻進行或持續進行中。  you'll be speaking 就是「你未來、以後都會這樣說」的意思。  這裡用 you'll speak 當然也沒有問題，只是語意上有些許差別而已。", '0'], ['23656', '78952', '2', '請問My day started out walking ~walking為什麼是現在進行式?另外可否不用start out片語,用My day start to walk with my dog來表示是正確的嗎?這裡是口語上比較簡略的說法，正式書寫應該要加上 with 比較正確，寫作：  My day started out with walking my dog... 表示『用』溜狗開始我的一天。  另外並不能用 My day start to walk with my dog 這樣就變成「我的一天開始去溜狗」，但溜狗的並不是你的一天，而是你本人。  可以試著用 start with（以...開始），像這句就可以寫作： My day started with walking my dog in Central Park...My day started with walking my dog in Central Park... 在這裡，walk會用ing是因為文法句型問題還是因為前面加with?是的，start with 後面要加上一件事，因此原本的動詞要改寫為

### get shuffled data

In [88]:
X_train_text, y_train, X_test_text, y_test, member_train, member_test, question_train, question_test = dh.get_shuffled_data()
print('X train shape: {}'.format(X_train.shape))
print('y train shape: {}'.format(y_train.shape))

X train shape: (3095, 9652)
y train shape: (3095,)


## extract features of text

In [124]:
class TextFeature(object):
    def __init__(self, training_data, testing_data):
        self.training_text = training_data
        self.testing_text = testing_data
        
    def get_tfidf(self, use_idf = True):
#         texts = self.training_text + self.testing_text
        tfidf_vectorizer = TfidfVectorizer(use_idf = True)
        tfidf_vectorizer.fit(self.training_text)
        X_train = tfidf_vectorizer.transform(self.training_text)
        X_test = None
        if self.testing_text != None:
            X_test = tfidf_vectorizer.transform(self.testing_text)
        return X_train, X_test

## get features

In [90]:
tf = TextFeature(X_train_text, X_test_text)
X_train, X_test = tf.get_tfidf()
print(X_train.shape)
print(X_test.shape)

(3095, 9680)
(780, 9680)


## Corss Validation

### Get all data and get features

In [125]:
X_text, y, member, question_ids = dh.get_all_unambiguous_data()
print('X shape: {}'.format(len(X_text)))
print('y shape: {}'.format(y.shape))

tf = TextFeature(X_text, None)
X, x_test = tf.get_tfidf()

X shape: 3875
y shape: (3875,)


In [129]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
import multiprocessing as mp

### Naive Bayes corss validation

In [130]:
NB_cv = Pipeline([('cls', MultinomialNB()),])
parameters = {'cls__alpha': (0.5, 0.8, 1.0, 5, 10)}
gs_cls = GridSearchCV(NB_cv, param_grid = parameters, cv = 10, n_jobs = mp.cpu_count()-1)
gs_cls = gs_cls.fit(X.todense(), y)



In [133]:
print('Best Paras:', gs_cls.best_params_)
y_predict = gs_cls.predict(X)
y_predict_prob = gs_cls.predict_proba(X)

infile = 'predicted/NB_question_predict.csv'
with open(infile, 'w') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter = ',', quotechar = '"')
    spamwriter.writerow(['question_id', 'member_id', 'question'] + cat)
    print(len(question_test))
    for i in range(len(question_test)):
        writestring = [question_test[i], data_dict[question_test[i]]['member_id'], data_dict[question_test[i]]['question']]
        writestring += list(y_predict_prob[i])
        spamwriter.writerow(writestring)
        
print(metrics.classification_report(y, y_predict))

Best Paras: {'cls__alpha': 0.5}
780
             precision    recall  f1-score   support

          0       0.00      0.00      0.00         2
          1       1.00      0.83      0.90       207
         10       0.89      0.92      0.91       528
         11       0.92      0.19      0.32        57
         12       0.91      0.94      0.93       515
          2       0.96      0.80      0.87       193
          3       0.92      0.85      0.88       448
          4       0.00      0.00      0.00        35
          5       0.92      0.94      0.93       462
          6       1.00      0.09      0.17        96
          7       1.00      0.11      0.20        88
          8       0.98      0.50      0.66       244
          9       0.70      0.99      0.82      1000

avg / total       0.86      0.84      0.82      3875



  'precision', 'predicted', average, warn_for)


### Random Forest cross validation

In [135]:
RF_cv = Pipeline([('cls', RandomForestClassifier()),])
parameters = {'cls__n_estimator': (10, 20, 64, 128, 256),
              'cls__max_features': ['auto', 'sqrt', 'log2', 'None']}
gs_cls = GridSearchCV(RF_cv, param_grid = parameters, cv = 10, n_jobs = mp.cpu_count()-1)
gs_cls = gs_cls.fit(X.todense(), y)



JoblibValueError: JoblibValueError
___________________________________________________________________________
Multiprocessing exception:
...........................................................................
/usr/lib/python3.4/runpy.py in _run_module_as_main(mod_name='ipykernel.__main__', alter_argv=1)
    165         sys.exit(msg)
    166     main_globals = sys.modules["__main__"].__dict__
    167     if alter_argv:
    168         sys.argv[0] = mod_spec.origin
    169     return _run_code(code, main_globals, None,
--> 170                      "__main__", mod_spec)
        mod_spec = ModuleSpec(name='ipykernel.__main__', loader=<_f...b/python3.4/dist-packages/ipykernel/__main__.py')
    171 
    172 def run_module(mod_name, init_globals=None,
    173                run_name=None, alter_sys=False):
    174     """Execute a module's code without importing it

...........................................................................
/usr/lib/python3.4/runpy.py in _run_code(code=<code object <module> at 0x7f28cb9af300, file "/...3.4/dist-packages/ipykernel/__main__.py", line 1>, run_globals={'__builtins__': <module 'builtins' (built-in)>, '__cached__': '/usr/local/lib/python3.4/dist-packages/ipykernel/__pycache__/__main__.cpython-34.pyc', '__doc__': None, '__file__': '/usr/local/lib/python3.4/dist-packages/ipykernel/__main__.py', '__loader__': <_frozen_importlib.SourceFileLoader object>, '__name__': '__main__', '__package__': 'ipykernel', '__spec__': ModuleSpec(name='ipykernel.__main__', loader=<_f...b/python3.4/dist-packages/ipykernel/__main__.py'), 'app': <module 'ipykernel.kernelapp' from '/usr/local/lib/python3.4/dist-packages/ipykernel/kernelapp.py'>}, init_globals=None, mod_name='__main__', mod_spec=ModuleSpec(name='ipykernel.__main__', loader=<_f...b/python3.4/dist-packages/ipykernel/__main__.py'), pkg_name='ipykernel', script_name=None)
     80                        __cached__ = cached,
     81                        __doc__ = None,
     82                        __loader__ = loader,
     83                        __package__ = pkg_name,
     84                        __spec__ = mod_spec)
---> 85     exec(code, run_globals)
        code = <code object <module> at 0x7f28cb9af300, file "/...3.4/dist-packages/ipykernel/__main__.py", line 1>
        run_globals = {'__builtins__': <module 'builtins' (built-in)>, '__cached__': '/usr/local/lib/python3.4/dist-packages/ipykernel/__pycache__/__main__.cpython-34.pyc', '__doc__': None, '__file__': '/usr/local/lib/python3.4/dist-packages/ipykernel/__main__.py', '__loader__': <_frozen_importlib.SourceFileLoader object>, '__name__': '__main__', '__package__': 'ipykernel', '__spec__': ModuleSpec(name='ipykernel.__main__', loader=<_f...b/python3.4/dist-packages/ipykernel/__main__.py'), 'app': <module 'ipykernel.kernelapp' from '/usr/local/lib/python3.4/dist-packages/ipykernel/kernelapp.py'>}
     86     return run_globals
     87 
     88 def _run_module_code(code, init_globals=None,
     89                     mod_name=None, mod_spec=None,

...........................................................................
/usr/local/lib/python3.4/dist-packages/ipykernel/__main__.py in <module>()
      1 
      2 
----> 3 
      4 if __name__ == '__main__':
      5     from ipykernel import kernelapp as app
      6     app.launch_new_instance()
      7 
      8 
      9 
     10 

...........................................................................
/usr/local/lib/python3.4/dist-packages/traitlets/config/application.py in launch_instance(cls=<class 'ipykernel.kernelapp.IPKernelApp'>, argv=None, **kwargs={})
    653 
    654         If a global instance already exists, this reinitializes and starts it
    655         """
    656         app = cls.instance(**kwargs)
    657         app.initialize(argv)
--> 658         app.start()
        app.start = <bound method IPKernelApp.start of <ipykernel.kernelapp.IPKernelApp object>>
    659 
    660 #-----------------------------------------------------------------------------
    661 # utility functions, for convenience
    662 #-----------------------------------------------------------------------------

...........................................................................
/usr/local/lib/python3.4/dist-packages/ipykernel/kernelapp.py in start(self=<ipykernel.kernelapp.IPKernelApp object>)
    469             return self.subapp.start()
    470         if self.poller is not None:
    471             self.poller.start()
    472         self.kernel.start()
    473         try:
--> 474             ioloop.IOLoop.instance().start()
    475         except KeyboardInterrupt:
    476             pass
    477 
    478 launch_new_instance = IPKernelApp.launch_instance

...........................................................................
/usr/local/lib/python3.4/dist-packages/zmq/eventloop/ioloop.py in start(self=<zmq.eventloop.ioloop.ZMQIOLoop object>)
    172             )
    173         return loop
    174     
    175     def start(self):
    176         try:
--> 177             super(ZMQIOLoop, self).start()
        self.start = <bound method ZMQIOLoop.start of <zmq.eventloop.ioloop.ZMQIOLoop object>>
    178         except ZMQError as e:
    179             if e.errno == ETERM:
    180                 # quietly return on ETERM
    181                 pass

...........................................................................
/usr/local/lib/python3.4/dist-packages/tornado/ioloop.py in start(self=<zmq.eventloop.ioloop.ZMQIOLoop object>)
    882                 self._events.update(event_pairs)
    883                 while self._events:
    884                     fd, events = self._events.popitem()
    885                     try:
    886                         fd_obj, handler_func = self._handlers[fd]
--> 887                         handler_func(fd_obj, events)
        handler_func = <function wrap.<locals>.null_wrapper>
        fd_obj = <zmq.sugar.socket.Socket object>
        events = 1
    888                     except (OSError, IOError) as e:
    889                         if errno_from_exception(e) == errno.EPIPE:
    890                             # Happens when the client closes the connection
    891                             pass

...........................................................................
/usr/local/lib/python3.4/dist-packages/tornado/stack_context.py in null_wrapper(*args=(<zmq.sugar.socket.Socket object>, 1), **kwargs={})
    270         # Fast path when there are no active contexts.
    271         def null_wrapper(*args, **kwargs):
    272             try:
    273                 current_state = _state.contexts
    274                 _state.contexts = cap_contexts[0]
--> 275                 return fn(*args, **kwargs)
        args = (<zmq.sugar.socket.Socket object>, 1)
        kwargs = {}
    276             finally:
    277                 _state.contexts = current_state
    278         null_wrapper._wrapped = True
    279         return null_wrapper

...........................................................................
/usr/local/lib/python3.4/dist-packages/zmq/eventloop/zmqstream.py in _handle_events(self=<zmq.eventloop.zmqstream.ZMQStream object>, fd=<zmq.sugar.socket.Socket object>, events=1)
    435             # dispatch events:
    436             if events & IOLoop.ERROR:
    437                 gen_log.error("got POLLERR event on ZMQStream, which doesn't make sense")
    438                 return
    439             if events & IOLoop.READ:
--> 440                 self._handle_recv()
        self._handle_recv = <bound method ZMQStream._handle_recv of <zmq.eventloop.zmqstream.ZMQStream object>>
    441                 if not self.socket:
    442                     return
    443             if events & IOLoop.WRITE:
    444                 self._handle_send()

...........................................................................
/usr/local/lib/python3.4/dist-packages/zmq/eventloop/zmqstream.py in _handle_recv(self=<zmq.eventloop.zmqstream.ZMQStream object>)
    467                 gen_log.error("RECV Error: %s"%zmq.strerror(e.errno))
    468         else:
    469             if self._recv_callback:
    470                 callback = self._recv_callback
    471                 # self._recv_callback = None
--> 472                 self._run_callback(callback, msg)
        self._run_callback = <bound method ZMQStream._run_callback of <zmq.eventloop.zmqstream.ZMQStream object>>
        callback = <function wrap.<locals>.null_wrapper>
        msg = [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>]
    473                 
    474         # self.update_state()
    475         
    476 

...........................................................................
/usr/local/lib/python3.4/dist-packages/zmq/eventloop/zmqstream.py in _run_callback(self=<zmq.eventloop.zmqstream.ZMQStream object>, callback=<function wrap.<locals>.null_wrapper>, *args=([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],), **kwargs={})
    409         close our socket."""
    410         try:
    411             # Use a NullContext to ensure that all StackContexts are run
    412             # inside our blanket exception handler rather than outside.
    413             with stack_context.NullContext():
--> 414                 callback(*args, **kwargs)
        callback = <function wrap.<locals>.null_wrapper>
        args = ([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],)
        kwargs = {}
    415         except:
    416             gen_log.error("Uncaught exception, closing connection.",
    417                           exc_info=True)
    418             # Close the socket on an uncaught exception from a user callback

...........................................................................
/usr/local/lib/python3.4/dist-packages/tornado/stack_context.py in null_wrapper(*args=([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],), **kwargs={})
    270         # Fast path when there are no active contexts.
    271         def null_wrapper(*args, **kwargs):
    272             try:
    273                 current_state = _state.contexts
    274                 _state.contexts = cap_contexts[0]
--> 275                 return fn(*args, **kwargs)
        args = ([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],)
        kwargs = {}
    276             finally:
    277                 _state.contexts = current_state
    278         null_wrapper._wrapped = True
    279         return null_wrapper

...........................................................................
/usr/local/lib/python3.4/dist-packages/ipykernel/kernelbase.py in dispatcher(msg=[<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>])
    271         if self.control_stream:
    272             self.control_stream.on_recv(self.dispatch_control, copy=False)
    273 
    274         def make_dispatcher(stream):
    275             def dispatcher(msg):
--> 276                 return self.dispatch_shell(stream, msg)
        msg = [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>]
    277             return dispatcher
    278 
    279         for s in self.shell_streams:
    280             s.on_recv(make_dispatcher(s), copy=False)

...........................................................................
/usr/local/lib/python3.4/dist-packages/ipykernel/kernelbase.py in dispatch_shell(self=<ipykernel.ipkernel.IPythonKernel object>, stream=<zmq.eventloop.zmqstream.ZMQStream object>, msg={'buffers': [], 'content': {'allow_stdin': True, 'code': "RF_cv = Pipeline([('cls', RandomForestClassifier...pu_count()-1)\ngs_cls = gs_cls.fit(X.todense(), y)", 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': datetime.datetime(2017, 6, 11, 9, 28, 30, 436727, tzinfo=datetime.timezone.utc), 'msg_id': '57329405E7E44EEC9C38898AC59BB2C4', 'msg_type': 'execute_request', 'session': 'FFA016C43BAA4F83871A6F169C136889', 'username': 'username', 'version': '5.0'}, 'metadata': {}, 'msg_id': '57329405E7E44EEC9C38898AC59BB2C4', 'msg_type': 'execute_request', 'parent_header': {}})
    223             self.log.error("UNKNOWN MESSAGE TYPE: %r", msg_type)
    224         else:
    225             self.log.debug("%s: %s", msg_type, msg)
    226             self.pre_handler_hook()
    227             try:
--> 228                 handler(stream, idents, msg)
        handler = <bound method IPythonKernel.execute_request of <ipykernel.ipkernel.IPythonKernel object>>
        stream = <zmq.eventloop.zmqstream.ZMQStream object>
        idents = [b'FFA016C43BAA4F83871A6F169C136889']
        msg = {'buffers': [], 'content': {'allow_stdin': True, 'code': "RF_cv = Pipeline([('cls', RandomForestClassifier...pu_count()-1)\ngs_cls = gs_cls.fit(X.todense(), y)", 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': datetime.datetime(2017, 6, 11, 9, 28, 30, 436727, tzinfo=datetime.timezone.utc), 'msg_id': '57329405E7E44EEC9C38898AC59BB2C4', 'msg_type': 'execute_request', 'session': 'FFA016C43BAA4F83871A6F169C136889', 'username': 'username', 'version': '5.0'}, 'metadata': {}, 'msg_id': '57329405E7E44EEC9C38898AC59BB2C4', 'msg_type': 'execute_request', 'parent_header': {}}
    229             except Exception:
    230                 self.log.error("Exception in message handler:", exc_info=True)
    231             finally:
    232                 self.post_handler_hook()

...........................................................................
/usr/local/lib/python3.4/dist-packages/ipykernel/kernelbase.py in execute_request(self=<ipykernel.ipkernel.IPythonKernel object>, stream=<zmq.eventloop.zmqstream.ZMQStream object>, ident=[b'FFA016C43BAA4F83871A6F169C136889'], parent={'buffers': [], 'content': {'allow_stdin': True, 'code': "RF_cv = Pipeline([('cls', RandomForestClassifier...pu_count()-1)\ngs_cls = gs_cls.fit(X.todense(), y)", 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': datetime.datetime(2017, 6, 11, 9, 28, 30, 436727, tzinfo=datetime.timezone.utc), 'msg_id': '57329405E7E44EEC9C38898AC59BB2C4', 'msg_type': 'execute_request', 'session': 'FFA016C43BAA4F83871A6F169C136889', 'username': 'username', 'version': '5.0'}, 'metadata': {}, 'msg_id': '57329405E7E44EEC9C38898AC59BB2C4', 'msg_type': 'execute_request', 'parent_header': {}})
    385         if not silent:
    386             self.execution_count += 1
    387             self._publish_execute_input(code, parent, self.execution_count)
    388 
    389         reply_content = self.do_execute(code, silent, store_history,
--> 390                                         user_expressions, allow_stdin)
        user_expressions = {}
        allow_stdin = True
    391 
    392         # Flush output before sending the reply.
    393         sys.stdout.flush()
    394         sys.stderr.flush()

...........................................................................
/usr/local/lib/python3.4/dist-packages/ipykernel/ipkernel.py in do_execute(self=<ipykernel.ipkernel.IPythonKernel object>, code="RF_cv = Pipeline([('cls', RandomForestClassifier...pu_count()-1)\ngs_cls = gs_cls.fit(X.todense(), y)", silent=False, store_history=True, user_expressions={}, allow_stdin=True)
    191 
    192         self._forward_input(allow_stdin)
    193 
    194         reply_content = {}
    195         try:
--> 196             res = shell.run_cell(code, store_history=store_history, silent=silent)
        res = undefined
        shell.run_cell = <bound method ZMQInteractiveShell.run_cell of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        code = "RF_cv = Pipeline([('cls', RandomForestClassifier...pu_count()-1)\ngs_cls = gs_cls.fit(X.todense(), y)"
        store_history = True
        silent = False
    197         finally:
    198             self._restore_input()
    199 
    200         if res.error_before_exec is not None:

...........................................................................
/usr/local/lib/python3.4/dist-packages/ipykernel/zmqshell.py in run_cell(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, *args=("RF_cv = Pipeline([('cls', RandomForestClassifier...pu_count()-1)\ngs_cls = gs_cls.fit(X.todense(), y)",), **kwargs={'silent': False, 'store_history': True})
    496             )
    497         self.payload_manager.write_payload(payload)
    498 
    499     def run_cell(self, *args, **kwargs):
    500         self._last_traceback = None
--> 501         return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
        self.run_cell = <bound method ZMQInteractiveShell.run_cell of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        args = ("RF_cv = Pipeline([('cls', RandomForestClassifier...pu_count()-1)\ngs_cls = gs_cls.fit(X.todense(), y)",)
        kwargs = {'silent': False, 'store_history': True}
    502 
    503     def _showtraceback(self, etype, evalue, stb):
    504         # try to preserve ordering of tracebacks and print statements
    505         sys.stdout.flush()

...........................................................................
/usr/local/lib/python3.4/dist-packages/IPython/core/interactiveshell.py in run_cell(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, raw_cell="RF_cv = Pipeline([('cls', RandomForestClassifier...pu_count()-1)\ngs_cls = gs_cls.fit(X.todense(), y)", store_history=True, silent=False, shell_futures=True)
   2712                 self.displayhook.exec_result = result
   2713 
   2714                 # Execute the user code
   2715                 interactivity = "none" if silent else self.ast_node_interactivity
   2716                 has_raised = self.run_ast_nodes(code_ast.body, cell_name,
-> 2717                    interactivity=interactivity, compiler=compiler, result=result)
        interactivity = 'last_expr'
        compiler = <IPython.core.compilerop.CachingCompiler object>
   2718                 
   2719                 self.last_execution_succeeded = not has_raised
   2720 
   2721                 # Reset this so later displayed values do not modify the

...........................................................................
/usr/local/lib/python3.4/dist-packages/IPython/core/interactiveshell.py in run_ast_nodes(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, nodelist=[<_ast.Assign object>, <_ast.Assign object>, <_ast.Assign object>, <_ast.Assign object>], cell_name='<ipython-input-135-b20d01c6b1ef>', interactivity='none', compiler=<IPython.core.compilerop.CachingCompiler object>, result=<ExecutionResult object at 7f2886d3df28, executi..._before_exec=None error_in_exec=None result=None>)
   2816 
   2817         try:
   2818             for i, node in enumerate(to_run_exec):
   2819                 mod = ast.Module([node])
   2820                 code = compiler(mod, cell_name, "exec")
-> 2821                 if self.run_code(code, result):
        self.run_code = <bound method ZMQInteractiveShell.run_code of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        code = <code object <module> at 0x7f2886cd58a0, file "<ipython-input-135-b20d01c6b1ef>", line 5>
        result = <ExecutionResult object at 7f2886d3df28, executi..._before_exec=None error_in_exec=None result=None>
   2822                     return True
   2823 
   2824             for i, node in enumerate(to_run_interactive):
   2825                 mod = ast.Interactive([node])

...........................................................................
/usr/local/lib/python3.4/dist-packages/IPython/core/interactiveshell.py in run_code(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, code_obj=<code object <module> at 0x7f2886cd58a0, file "<ipython-input-135-b20d01c6b1ef>", line 5>, result=<ExecutionResult object at 7f2886d3df28, executi..._before_exec=None error_in_exec=None result=None>)
   2876         outflag = 1  # happens in more places, so it's easier as default
   2877         try:
   2878             try:
   2879                 self.hooks.pre_run_code_hook()
   2880                 #rprint('Running code', repr(code_obj)) # dbg
-> 2881                 exec(code_obj, self.user_global_ns, self.user_ns)
        code_obj = <code object <module> at 0x7f2886cd58a0, file "<ipython-input-135-b20d01c6b1ef>", line 5>
        self.user_global_ns = {'CountVectorizer': <class 'sklearn.feature_extraction.text.CountVectorizer'>, 'DataHelper': <class '__main__.DataHelper'>, 'Grammar': {'0': '其它', '1': '完成式', '10': '連接詞', '11': '假設語氣', '12': '分詞', '13': 'PT', '2': '進行式', '3': '過去式', '4': '未來式', '5': '關係代名詞', ...}, 'GridSearchCV': <class 'sklearn.model_selection._search.GridSearchCV'>, 'In': ['', "from collections import defaultdict\n\nclass DataH...pus.append(' '.join(final))\n        return corpus", "dh = DataHelper('questions_nondup_dup.csv')\nprin....unamb_data['2'][:3])\nprint(dh.amb_data['2'][:3])", 'import csv\nimport jieba\nimport re\nimport random\n...Classifier\nfrom sklearn.svm import LinearSVC, SVC', "from collections import defaultdict\n\nclass DataH...pus.append(' '.join(final))\n        return corpus", "dh = DataHelper('questions_nondup_dup.csv')\nprin....unamb_data['2'][:3])\nprint(dh.amb_data['2'][:3])", "X_train_text, y_train, X_test_text, y_test, memb...\nprint('y train shape: {}'.format(y_train.shape))", 'tf = TextFeature(X_train_text, X_test_text)\nX_tr..._tfidf()\nprint(X_train.shape)\nprint(X_test.shape)', 'class TextFeature(object):\n    def __init__(self...self.testing_text)\n        return X_train, X_test', "with open('questions_nondup_dup.csv') as csvfile...ile):\n        data_dict[row['question_id']] = row", 'NB = MultinomialNB(alpha = 1.0)\nNB.fit(X_train.t...prob[i])\n        spamwriter.writerow(writestring)', "X_train_text, y_train, X_test_text, y_test, memb...\nprint('y train shape: {}'.format(y_train.shape))", 'tf = TextFeature(X_train_text, X_test_text)\nX_tr..._tfidf()\nprint(X_train.shape)\nprint(X_test.shape)', "Grammar = {'1': '完成式', '2': '進行式', '3': '過去式', '... '11': '假設語氣', '12': '分詞', '13': 'PT', '0': '其它'}", "with open('questions_nondup_dup.csv') as csvfile...ile):\n        data_dict[row['question_id']] = row", 'NB = MultinomialNB(alpha = 1.0)\nNB.fit(X_train.t...prob[i])\n        spamwriter.writerow(writestring)', "from collections import defaultdict\n\nclass DataH...pus.append(' '.join(final))\n        return corpus", "dh = DataHelper('questions_nondup_dup.csv')\nprin....unamb_data['2'][:3])\nprint(dh.amb_data['2'][:3])", 'class TextFeature(object):\n    def __init__(self...self.testing_text)\n        return X_train, X_test', "X_train_text, y_train, X_test_text, y_test, memb...\nprint('y train shape: {}'.format(y_train.shape))", ...], 'LinearSVC': <class 'sklearn.svm.classes.LinearSVC'>, 'MultinomialNB': <class 'sklearn.naive_bayes.MultinomialNB'>, 'NB': MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 'NB_cv': Pipeline(steps=[('cls', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]), 'Out': {}, ...}
        self.user_ns = {'CountVectorizer': <class 'sklearn.feature_extraction.text.CountVectorizer'>, 'DataHelper': <class '__main__.DataHelper'>, 'Grammar': {'0': '其它', '1': '完成式', '10': '連接詞', '11': '假設語氣', '12': '分詞', '13': 'PT', '2': '進行式', '3': '過去式', '4': '未來式', '5': '關係代名詞', ...}, 'GridSearchCV': <class 'sklearn.model_selection._search.GridSearchCV'>, 'In': ['', "from collections import defaultdict\n\nclass DataH...pus.append(' '.join(final))\n        return corpus", "dh = DataHelper('questions_nondup_dup.csv')\nprin....unamb_data['2'][:3])\nprint(dh.amb_data['2'][:3])", 'import csv\nimport jieba\nimport re\nimport random\n...Classifier\nfrom sklearn.svm import LinearSVC, SVC', "from collections import defaultdict\n\nclass DataH...pus.append(' '.join(final))\n        return corpus", "dh = DataHelper('questions_nondup_dup.csv')\nprin....unamb_data['2'][:3])\nprint(dh.amb_data['2'][:3])", "X_train_text, y_train, X_test_text, y_test, memb...\nprint('y train shape: {}'.format(y_train.shape))", 'tf = TextFeature(X_train_text, X_test_text)\nX_tr..._tfidf()\nprint(X_train.shape)\nprint(X_test.shape)', 'class TextFeature(object):\n    def __init__(self...self.testing_text)\n        return X_train, X_test', "with open('questions_nondup_dup.csv') as csvfile...ile):\n        data_dict[row['question_id']] = row", 'NB = MultinomialNB(alpha = 1.0)\nNB.fit(X_train.t...prob[i])\n        spamwriter.writerow(writestring)', "X_train_text, y_train, X_test_text, y_test, memb...\nprint('y train shape: {}'.format(y_train.shape))", 'tf = TextFeature(X_train_text, X_test_text)\nX_tr..._tfidf()\nprint(X_train.shape)\nprint(X_test.shape)', "Grammar = {'1': '完成式', '2': '進行式', '3': '過去式', '... '11': '假設語氣', '12': '分詞', '13': 'PT', '0': '其它'}", "with open('questions_nondup_dup.csv') as csvfile...ile):\n        data_dict[row['question_id']] = row", 'NB = MultinomialNB(alpha = 1.0)\nNB.fit(X_train.t...prob[i])\n        spamwriter.writerow(writestring)', "from collections import defaultdict\n\nclass DataH...pus.append(' '.join(final))\n        return corpus", "dh = DataHelper('questions_nondup_dup.csv')\nprin....unamb_data['2'][:3])\nprint(dh.amb_data['2'][:3])", 'class TextFeature(object):\n    def __init__(self...self.testing_text)\n        return X_train, X_test', "X_train_text, y_train, X_test_text, y_test, memb...\nprint('y train shape: {}'.format(y_train.shape))", ...], 'LinearSVC': <class 'sklearn.svm.classes.LinearSVC'>, 'MultinomialNB': <class 'sklearn.naive_bayes.MultinomialNB'>, 'NB': MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 'NB_cv': Pipeline(steps=[('cls', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]), 'Out': {}, ...}
   2882             finally:
   2883                 # Reset our crash handler in place
   2884                 sys.excepthook = old_excepthook
   2885         except SystemExit as e:

...........................................................................
/home/pan/Idealab/NTHU/Semester-6/NLP/term_project/<ipython-input-135-b20d01c6b1ef> in <module>()
      1 
      2 RF_cv = Pipeline([('cls', RandomForestClassifier()),])
      3 parameters = {'cls__n_estimator': (10, 20, 64, 128, 256),
      4               'cls__max_features': ['auto', 'sqrt', 'log2', 'None']}
----> 5 gs_cls = GridSearchCV(RF_cv, param_grid = parameters, cv = 10, n_jobs = mp.cpu_count()-1)
      6 gs_cls = gs_cls.fit(X.todense(), y)
      7 
      8 
      9 
     10 

...........................................................................
/usr/local/lib/python3.4/dist-packages/sklearn/model_selection/_search.py in fit(self=GridSearchCV(cv=10, error_score='raise',
       ...train_score=True,
       scoring=None, verbose=0), X=matrix([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
   ....],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.]]), y=array(['1', '1', '1', ..., '2', '2', '2'], 
      dtype='<U2'), groups=None)
    940 
    941         groups : array-like, with shape (n_samples,), optional
    942             Group labels for the samples used while splitting the dataset into
    943             train/test set.
    944         """
--> 945         return self._fit(X, y, groups, ParameterGrid(self.param_grid))
        self._fit = <bound method GridSearchCV._fit of GridSearchCV(...rain_score=True,
       scoring=None, verbose=0)>
        X = matrix([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
   ....],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.]])
        y = array(['1', '1', '1', ..., '2', '2', '2'], 
      dtype='<U2')
        groups = None
        self.param_grid = {'cls__max_features': ['auto', 'sqrt', 'log2', 'None'], 'cls__n_estimator': (10, 20, 64, 128, 256)}
    946 
    947 
    948 class RandomizedSearchCV(BaseSearchCV):
    949     """Randomized search on hyper parameters.

...........................................................................
/usr/local/lib/python3.4/dist-packages/sklearn/model_selection/_search.py in _fit(self=GridSearchCV(cv=10, error_score='raise',
       ...train_score=True,
       scoring=None, verbose=0), X=matrix([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
   ....],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.]]), y=array(['1', '1', '1', ..., '2', '2', '2'], 
      dtype='<U2'), groups=None, parameter_iterable=<sklearn.model_selection._search.ParameterGrid object>)
    559                                   fit_params=self.fit_params,
    560                                   return_train_score=self.return_train_score,
    561                                   return_n_test_samples=True,
    562                                   return_times=True, return_parameters=True,
    563                                   error_score=self.error_score)
--> 564           for parameters in parameter_iterable
        parameters = undefined
        parameter_iterable = <sklearn.model_selection._search.ParameterGrid object>
    565           for train, test in cv_iter)
    566 
    567         # if one choose to see train score, "out" will contain train score info
    568         if self.return_train_score:

...........................................................................
/usr/local/lib/python3.4/dist-packages/sklearn/externals/joblib/parallel.py in __call__(self=Parallel(n_jobs=7), iterable=<generator object <genexpr>>)
    763             if pre_dispatch == "all" or n_jobs == 1:
    764                 # The iterable was consumed all at once by the above for loop.
    765                 # No need to wait for async callbacks to trigger to
    766                 # consumption.
    767                 self._iterating = False
--> 768             self.retrieve()
        self.retrieve = <bound method Parallel.retrieve of Parallel(n_jobs=7)>
    769             # Make sure that we get a last message telling us we are done
    770             elapsed_time = time.time() - self._start_time
    771             self._print('Done %3i out of %3i | elapsed: %s finished',
    772                         (len(self._output), len(self._output),

---------------------------------------------------------------------------
Sub-process traceback:
---------------------------------------------------------------------------
ValueError                                         Sun Jun 11 17:28:31 2017
PID: 5527                                    Python 3.4.3: /usr/bin/python3
...........................................................................
/usr/local/lib/python3.4/dist-packages/sklearn/externals/joblib/parallel.py in __call__(self=<sklearn.externals.joblib.parallel.BatchedCalls object>)
    126     def __init__(self, iterator_slice):
    127         self.items = list(iterator_slice)
    128         self._size = len(self.items)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
        self.items = [(<function _fit_and_score>, (Pipeline(steps=[('cls', RandomForestClassifier(b...None,
            verbose=0, warm_start=False))]), matrix([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
   ....],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.]]), array(['1', '1', '1', ..., '2', '2', '2'], 
      dtype='<U2'), <function _passthrough_scorer>, array([  21,   22,   23, ..., 3872, 3873, 3874]), array([   0,    1,    2,    3,    4,    5,    6,... 3694, 3695, 3696, 3697, 3698, 3699, 3700, 3701]), 0, {'cls__max_features': 'auto', 'cls__n_estimator': 10}), {'error_score': 'raise', 'fit_params': {}, 'return_n_test_samples': True, 'return_parameters': True, 'return_times': True, 'return_train_score': True})]
    132 
    133     def __len__(self):
    134         return self._size
    135 

...........................................................................
/usr/local/lib/python3.4/dist-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0=<list_iterator object>)
    126     def __init__(self, iterator_slice):
    127         self.items = list(iterator_slice)
    128         self._size = len(self.items)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
        func = <function _fit_and_score>
        args = (Pipeline(steps=[('cls', RandomForestClassifier(b...None,
            verbose=0, warm_start=False))]), matrix([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
   ....],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.]]), array(['1', '1', '1', ..., '2', '2', '2'], 
      dtype='<U2'), <function _passthrough_scorer>, array([  21,   22,   23, ..., 3872, 3873, 3874]), array([   0,    1,    2,    3,    4,    5,    6,... 3694, 3695, 3696, 3697, 3698, 3699, 3700, 3701]), 0, {'cls__max_features': 'auto', 'cls__n_estimator': 10})
        kwargs = {'error_score': 'raise', 'fit_params': {}, 'return_n_test_samples': True, 'return_parameters': True, 'return_times': True, 'return_train_score': True}
    132 
    133     def __len__(self):
    134         return self._size
    135 

...........................................................................
/usr/local/lib/python3.4/dist-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator=Pipeline(steps=[('cls', RandomForestClassifier(b...None,
            verbose=0, warm_start=False))]), X=matrix([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
   ....],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.]]), y=array(['1', '1', '1', ..., '2', '2', '2'], 
      dtype='<U2'), scorer=<function _passthrough_scorer>, train=array([  21,   22,   23, ..., 3872, 3873, 3874]), test=array([   0,    1,    2,    3,    4,    5,    6,... 3694, 3695, 3696, 3697, 3698, 3699, 3700, 3701]), verbose=0, parameters={'cls__max_features': 'auto', 'cls__n_estimator': 10}, fit_params={}, return_train_score=True, return_parameters=True, return_n_test_samples=True, return_times=True, error_score='raise')
    222     fit_params = fit_params if fit_params is not None else {}
    223     fit_params = dict([(k, _index_param_value(X, v, train))
    224                       for k, v in fit_params.items()])
    225 
    226     if parameters is not None:
--> 227         estimator.set_params(**parameters)
        estimator.set_params = <bound method Pipeline.set_params of Pipeline(st...one,
            verbose=0, warm_start=False))])>
        parameters = {'cls__max_features': 'auto', 'cls__n_estimator': 10}
    228 
    229     start_time = time.time()
    230 
    231     X_train, y_train = _safe_split(estimator, X, y, train)

...........................................................................
/usr/local/lib/python3.4/dist-packages/sklearn/pipeline.py in set_params(self=Pipeline(steps=[('cls', RandomForestClassifier(b...None,
            verbose=0, warm_start=False))]), **kwargs={'cls__max_features': 'auto', 'cls__n_estimator': 10})
    175 
    176         Returns
    177         -------
    178         self
    179         """
--> 180         self._set_params('steps', **kwargs)
        self._set_params = <bound method Pipeline._set_params of Pipeline(s...one,
            verbose=0, warm_start=False))])>
        kwargs = {'cls__max_features': 'auto', 'cls__n_estimator': 10}
    181         return self
    182 
    183     def _validate_steps(self):
    184         names, estimators = zip(*self.steps)

...........................................................................
/usr/local/lib/python3.4/dist-packages/sklearn/pipeline.py in _set_params(self=Pipeline(steps=[('cls', RandomForestClassifier(b...None,
            verbose=0, warm_start=False))]), steps_attr='steps', **params={'cls__max_features': 'auto', 'cls__n_estimator': 10})
     64         step_names, _ = zip(*getattr(self, steps_attr))
     65         for name in list(six.iterkeys(params)):
     66             if '__' not in name and name in step_names:
     67                 self._replace_step(steps_attr, name, params.pop(name))
     68         # 3. Step parameters and other initilisation arguments
---> 69         super(_BasePipeline, self).set_params(**params)
        self.set_params = <bound method Pipeline.set_params of Pipeline(st...one,
            verbose=0, warm_start=False))])>
        params = {'cls__max_features': 'auto', 'cls__n_estimator': 10}
     70         return self
     71 
     72     def _validate_names(self, names):
     73         if len(set(names)) != len(names):

...........................................................................
/usr/local/lib/python3.4/dist-packages/sklearn/base.py in set_params(self=Pipeline(steps=[('cls', RandomForestClassifier(b...None,
            verbose=0, warm_start=False))]), **params={'cls__max_features': 'auto', 'cls__n_estimator': 10})
    279                     raise ValueError('Invalid parameter %s for estimator %s. '
    280                                      'Check the list of available parameters '
    281                                      'with `estimator.get_params().keys()`.' %
    282                                      (name, self))
    283                 sub_object = valid_params[name]
--> 284                 sub_object.set_params(**{sub_name: value})
        sub_object.set_params = <bound method RandomForestClassifier.set_params ...e=None,
            verbose=0, warm_start=False)>
        sub_name = 'n_estimator'
        value = 10
    285             else:
    286                 # simple objects case
    287                 if key not in valid_params:
    288                     raise ValueError('Invalid parameter %s for estimator %s. '

...........................................................................
/usr/local/lib/python3.4/dist-packages/sklearn/base.py in set_params(self=RandomForestClassifier(bootstrap=True, class_wei...te=None,
            verbose=0, warm_start=False), **params={'n_estimator': 10})
    286                 # simple objects case
    287                 if key not in valid_params:
    288                     raise ValueError('Invalid parameter %s for estimator %s. '
    289                                      'Check the list of available parameters '
    290                                      'with `estimator.get_params().keys()`.' %
--> 291                                      (key, self.__class__.__name__))
        key = 'n_estimator'
        self.__class__.__name__ = 'RandomForestClassifier'
    292                 setattr(self, key, value)
    293         return self
    294 
    295     def __repr__(self):

ValueError: Invalid parameter n_estimator for estimator RandomForestClassifier. Check the list of available parameters with `estimator.get_params().keys()`.
___________________________________________________________________________

In [None]:
print('Best Paras:', gs_cls.best_params_)
y_predict = gs_cls.predict(X)
y_predict_prob = gs_cls.predict_proba(X)

infile = 'predicted/RF_question_predict.csv'
with open(infile, 'w') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter = ',', quotechar = '"')
    spamwriter.writerow(['question_id', 'member_id', 'question'] + cat)
    print(len(question_test))
    for i in range(len(question_test)):
        writestring = [question_test[i], data_dict[question_test[i]]['member_id'], data_dict[question_test[i]]['question']]
        writestring += list(y_predict_prob[i])
        spamwriter.writerow(writestring)
        
print(metrics.classification_report(y, y_predict))

## Naive Bayes

In [93]:
NB = MultinomialNB(alpha = 1.0)
NB.fit(X_train.todense(), y_train)
y_predict = NB.predict(X_test.todense())
print(metrics.classification_report(y_test, y_predict))

y_predict_prob = NB.predict_proba(X_test.todense())
infile = 'predicted/NB_question_predict.csv'
with open(infile, 'w') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter = ',', quotechar = '"')
    spamwriter.writerow(['question_id', 'member_id', 'question'] + cat)
    print(len(question_test))
    for i in range(len(question_test)):
        writestring = [question_test[i], data_dict[question_test[i]]['member_id'], data_dict[question_test[i]]['question']]
        writestring += list(y_predict_prob[i])
        spamwriter.writerow(writestring)

             precision    recall  f1-score   support

          0       0.00      0.00      0.00         1
          1       0.96      0.55      0.70        42
         10       0.90      0.65      0.75       106
         11       0.00      0.00      0.00        12
         12       0.83      0.84      0.84       103
          2       0.87      0.51      0.65        39
          3       0.86      0.71      0.78        90
          4       0.00      0.00      0.00         7
          5       0.87      0.81      0.84        93
          6       0.00      0.00      0.00        20
          7       0.00      0.00      0.00        18
          8       1.00      0.20      0.34        49
          9       0.52      0.99      0.69       200

avg / total       0.73      0.70      0.67       780

780


  'precision', 'predicted', average, warn_for)


## Random Forest

In [59]:
RF  = RandomForestClassifier(n_jobs=-1, max_features="sqrt", n_estimators=128)
RF.fit(X_train.todense(), y_train)
y_predicted = RF.predict(X_test)
print(metrics.classification_report(y_test, y_predicted))

             precision    recall  f1-score   support

          0       0.00      0.00      0.00         1
          1       1.00      0.95      0.98        42
         10       0.96      0.93      0.95       106
         11       1.00      0.83      0.91        12
         12       0.95      0.97      0.96       103
          2       0.97      0.87      0.92        39
          3       0.90      0.88      0.89        90
          4       1.00      0.57      0.73         7
          5       0.87      0.99      0.92        93
          6       1.00      0.85      0.92        20
          7       1.00      0.61      0.76        18
          8       0.95      0.84      0.89        49
          9       0.87      0.95      0.91       200

avg / total       0.92      0.92      0.92       780



  'precision', 'predicted', average, warn_for)


## SVM

In [60]:
svc = LinearSVC(C=1.0, max_iter=10000)
svc = svc.fit(X = X_train.todense(), y = y_train)
y_predict = svc.predict(X = X_test)
print(metrics.classification_report(y_test, y_predict))

             precision    recall  f1-score   support

          0       0.00      0.00      0.00         1
          1       1.00      1.00      1.00        42
         10       0.95      0.94      0.95       106
         11       1.00      0.67      0.80        12
         12       0.93      0.96      0.94       103
          2       0.95      0.92      0.94        39
          3       0.93      0.91      0.92        90
          4       1.00      1.00      1.00         7
          5       0.93      0.97      0.95        93
          6       0.90      0.95      0.93        20
          7       0.80      0.67      0.73        18
          8       0.95      0.86      0.90        49
          9       0.92      0.95      0.94       200

avg / total       0.93      0.93      0.93       780



  'precision', 'predicted', average, warn_for)


### get fixed data

In [61]:
X_train_text, y_train, X_test_text, y_test, member_train, member_test, question_train, question_test = dh.get_fixed_data()
print('X train shape: {}'.format(len(X_train_text)))
print('y train shape: {}'.format(y_train.shape))

X train shape: 3875
y train shape: (3875,)


In [62]:
tf = TextFeature(X_train_text, X_test_text)
X_train, X_test = tf.get_tfidf()
print(X_train.shape)
print(X_test.shape)

(3875, 10760)
(1949, 10760)


## write predicted results into file

In [65]:
NB = MultinomialNB(alpha = 1.0)
NB.fit(X_train.todense(), y_train)
y_predict_prob = NB.predict_proba(X_test.todense())
cat = [Grammar[item] for item in NB.classes_]
out_NB = 'predicted/NB_predicted.csv'
with open(out_NB, 'w') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter = ',', quotechar = '"')
    spamwriter.writerow(['question_id', 'member_id', 'question'] + cat)
    for i in range(len(question_test)):
        writestring = [question_test[i], data_dict[question_test[i]]['member_id'], data_dict[question_test[i]]['question']]
        writestring += list(y_predict_prob[i])
        spamwriter.writerow(writestring)

In [66]:
RF  = RandomForestClassifier(n_jobs=-1, max_features="sqrt", n_estimators=128)
RF.fit(X_train.todense(), y_train)
y_predict_prob = RF.predict_proba(X_test.todense())
cat = [Grammar[item] for item in RF.classes_]
out_RF = 'predicted/RF_predicted.csv'
with open(out_RF, 'w') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter = ',', quotechar = '"')
    spamwriter.writerow(['question_id', 'member_id', 'question'] + cat)
    for i in range(len(question_test)):
        writestring = [question_test[i], data_dict[question_test[i]]['member_id'], data_dict[question_test[i]]['question']]
        writestring += list(y_predict_prob[i])
        spamwriter.writerow(writestring)

In [223]:
SVC = LinearSVC(C=1.0, max_iter=10000)
SVC = SVC.fit(X = X_train.todense(), y = y_train)
y_predict = SVC.predict(X_test.todense())
# cat = [Grammar[item] for item in SVC.classes_]
out_SVC = 'predicted/SVC_predicted.csv'
with open(out_SVC, 'w') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter = ',', quotechar = '"')
    spamwriter.writerow(['question_id', 'member_id', 'question'] + cat)
    for i in range(len(question_test)):
        writestring = [question_test[i], data_dict[question_test[i]]['member_id'], data_dict[question_test[i]]['question']]
        writestring += [Grammar[y_predict[i]]]
        spamwriter.writerow(writestring)

## Convert csv to json

In [None]:
import json
def conver2json(infile, outfile = 'predicted/NB_question_predict.json'):
    predict_dict = defaultdict(dict)

    predict_dict = defaultdict(dict)
    with open(infile, 'r') as csvfile:
        for row in csv.DictReader(csvfile):
            predict_dict[row['question_id']]['member_id'] = row['member_id']
            predict_dict[row['question_id']]['question'] = row['question']
            predict_dict[row['question_id']]['reply'] = 'NO!'
            predict_dict[row['question_id']]['其它'] = row['其它']
            predict_dict[row['question_id']]['完成式'] = row['完成式']
            predict_dict[row['question_id']]['連接詞'] = row['連接詞']
            predict_dict[row['question_id']]['假設語氣'] = row['假設語氣']
            predict_dict[row['question_id']]['分詞'] = row['分詞']
            predict_dict[row['question_id']]['進行式'] = row['進行式']
            predict_dict[row['question_id']]['過去式'] = row['過去式']
            predict_dict[row['question_id']]['未來式'] = row['未來式']
            predict_dict[row['question_id']]['關係代名詞'] = row['關係代名詞']
            predict_dict[row['question_id']]['不定詞'] = row['不定詞']
            predict_dict[row['question_id']]['名詞子句'] = row['名詞子句']
            predict_dict[row['question_id']]['被動'] = row['被動']
            predict_dict[row['question_id']]['介係詞'] = row['介係詞']
            predict_dict[row['question_id']]['question_type'] = 0
    
    with open(outfile, 'w') as jsonfile:
        json.dump(predict_dict, jsonfile)

In [None]:
infile = 'predicted/NB_question_predicted.csv'
outfile = 'predicted/NB_question_predicted.json'
conver2json(infile)