In [1]:
import re
import warnings

import numpy as np
import pandas as pd

In [2]:
# Read the training and test tables from the local ./open directory.
train, test = pd.read_csv("./open/train.csv"), pd.read_csv("./open/test.csv")

In [3]:
from konlpy.tag import Mecab
# One shared tagger instance for the whole notebook; `pos` below calls
# `mecab.pos(...)` on every piece of text it is applied to.
mecab = Mecab()

In [4]:
%%time
# mecab-ko POS tags whose tokens are kept (nouns, verbs, adjectives, copulas,
# determiners, general adverbs, noun prefixes, foreign-script and
# Chinese-character tokens -- per the mecab-ko tag table; confirm if the
# tagger dictionary is changed).
_KEEP_TAGS = frozenset(
    ["NNG", "NNP", "VV", "VA", "VCP", "VCN", "MM", "MAG", "XPN", "SL", "SH"]
)
# A token is rejected if it contains any non-word character or an ASCII digit.
_REJECT_CHARS = re.compile(r"\W|[0-9]")


def pos(x):
    """Reduce a piece of text to a space-separated string of filtered tokens.

    The text is tagged with the module-level ``mecab`` tagger and a token is
    kept only when all of the following hold:

    * its POS tag is in ``_KEEP_TAGS``;
    * it is longer than one character;
    * it contains no non-word character and no ASCII digit.

    Parameters
    ----------
    x : object
        Cell value to process. Non-string values are converted with ``str``.
        Missing values (``None``, ``NaN``, ``pd.NA``) are treated as empty
        text instead of being stringified to ``"None"`` / ``"nan"`` and
        tagged as if they were real words.

    Returns
    -------
    str
        Kept tokens joined by single spaces; ``""`` when the input is
        missing, nothing survives the filter, or tagging fails. The function
        always returns a string so that later string concatenation and
        vectorisation never see ``None``.
    """
    if pd.api.types.is_scalar(x) and pd.isna(x):
        return ""

    try:
        tagged = mecab.pos(str(x))
    except Exception as exc:
        # Stay best-effort (one bad row must not abort a multi-minute run),
        # but report the failure instead of silently returning None.
        warnings.warn(
            "pos(): tagging failed for {!r} ({!r}); using an empty string".format(
                str(x)[:50], exc
            ),
            RuntimeWarning,
        )
        return ""

    kept = [
        word
        for word, tag in tagged
        if tag in _KEEP_TAGS
        and len(word) > 1
        and _REJECT_CHARS.search(word) is None
    ]
    return " ".join(kept)

# Overwrite every free-text column with its filtered-token form.
# Same order as before: all train columns first, then all test columns.
for frame in (train, test):
    for column in ("과제명", "요약문_연구목표", "요약문_연구내용", "요약문_기대효과", "요약문_한글키워드"):
        frame[column] = frame[column].apply(pos)

CPU times: user 6min 10s, sys: 547 ms, total: 6min 11s
Wall time: 6min 12s


In [5]:
# Model input text: project title and Korean keywords joined by one space.
train["concat"] = train["과제명"].add(" ").add(train["요약문_한글키워드"])

In [6]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the rows for validation; the fixed seed makes the split repeatable.
train_data, valid_data, train_label, valid_label = train_test_split(
    train["concat"],
    train["label"],
    test_size=0.1,
    random_state=42,
)

In [7]:
# Build the test-set input text the same way as for train: title + " " + keywords.
test["concat"] = test["과제명"].add(" ").add(test["요약문_한글키워드"])

In [8]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV

# n-gram counts -> TF-IDF re-weighting -> multinomial naive Bayes.
text_clf = Pipeline(
    [
        ("vect", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        ("clf", MultinomialNB()),
    ]
)
# Baseline fit with default hyper-parameters. fit() returns the estimator
# itself, so the result does not need to be assigned back.
text_clf.fit(train_data, train_label)

# Search space: upper n-gram length for the vectorizer and the naive Bayes
# smoothing parameter.
params_clf = {
    "vect__ngram_range": [(1, 5), (1, 7), (1, 9)],
    "clf__alpha": [0.01, 0.008, 0.006],
}

gs_clf = GridSearchCV(text_clf, params_clf, n_jobs=5, verbose=2)
gs_clf.fit(train_data, train_label)

print("Best score: {}".format(gs_clf.best_score_))
# Complete parameter dict of the refit best pipeline (defaults included).
best_parametors = gs_clf.best_estimator_.get_params()

Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best score: 0.9160722394796412


In [9]:
# Display the parameter dict taken from gs_clf.best_estimator_.get_params().
best_parametors

{'memory': None,
 'steps': [('vect', CountVectorizer(ngram_range=(1, 7))),
  ('tfidf', TfidfTransformer()),
  ('clf', MultinomialNB(alpha=0.008))],
 'verbose': False,
 'vect': CountVectorizer(ngram_range=(1, 7)),
 'tfidf': TfidfTransformer(),
 'clf': MultinomialNB(alpha=0.008),
 'vect__analyzer': 'word',
 'vect__binary': False,
 'vect__decode_error': 'strict',
 'vect__dtype': numpy.int64,
 'vect__encoding': 'utf-8',
 'vect__input': 'content',
 'vect__lowercase': True,
 'vect__max_df': 1.0,
 'vect__max_features': None,
 'vect__min_df': 1,
 'vect__ngram_range': (1, 7),
 'vect__preprocessor': None,
 'vect__stop_words': None,
 'vect__strip_accents': None,
 'vect__token_pattern': '(?u)\\b\\w\\w+\\b',
 'vect__tokenizer': None,
 'vect__vocabulary': None,
 'tfidf__norm': 'l2',
 'tfidf__smooth_idf': True,
 'tfidf__sublinear_tf': False,
 'tfidf__use_idf': True,
 'clf__alpha': 0.008,
 'clf__class_prior': None,
 'clf__fit_prior': True}

In [10]:
# Accuracy of the refit best pipeline on the 10% hold-out split.
pred = gs_clf.best_estimator_.predict(valid_data)
(pred == valid_label).mean()

0.9262233951006827

import pickle

# Persist the tuned pipeline, i.e. the model that produced the validation
# score and the submission. `text_clf` is only the default-parameter baseline
# (unigrams, default alpha), so pickling it would save a different model from
# the one that was evaluated.
with open("naive_bayes_concat_5gram.pkl", "wb") as f:
    pickle.dump(gs_clf.best_estimator_, f)

In [11]:
# Label the test set with the refit best pipeline; with the default
# refit=True, GridSearchCV.predict delegates to best_estimator_.predict.
pred = gs_clf.predict(test["concat"])

In [12]:
# Number of test rows predicted as any label other than 0.
(pred != 0).sum()

7019

In [13]:
# Fill the submission template's label column with the test predictions and save it.
sample_submission = pd.read_csv('./open/sample_submission.csv').assign(label=pred)
sample_submission.to_csv('naive_bayes_keyname(dic_variouspos&over1).csv', index=False)

[CV] END ..........clf__alpha=0.01, vect__ngram_range=(1, 5); total time=  50.0s
[CV] END ..........clf__alpha=0.01, vect__ngram_range=(1, 7); total time= 1.3min
[CV] END ..........clf__alpha=0.01, vect__ngram_range=(1, 9); total time= 1.6min
[CV] END .........clf__alpha=0.008, vect__ngram_range=(1, 5); total time=  40.7s
[CV] END .........clf__alpha=0.008, vect__ngram_range=(1, 7); total time= 1.1min
[CV] END .........clf__alpha=0.008, vect__ngram_range=(1, 9); total time= 1.6min
[CV] END .........clf__alpha=0.006, vect__ngram_range=(1, 5); total time=  41.5s
[CV] END .........clf__alpha=0.006, vect__ngram_range=(1, 7); total time= 1.1min
[CV] END .........clf__alpha=0.006, vect__ngram_range=(1, 9); total time= 1.6min
[CV] END ..........clf__alpha=0.01, vect__ngram_range=(1, 5); total time=  51.1s
[CV] END ..........clf__alpha=0.01, vect__ngram_range=(1, 7); total time= 1.3min
[CV] END ..........clf__alpha=0.01, vect__ngram_range=(1, 9); total time= 1.6min
[CV] END .........clf__alpha