In [None]:
import MeCab
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier,LogisticRegression
from sklearn.pipeline import FeatureUnion,Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV


In [None]:
class WordDividor:
    """Split Japanese text into words with MeCab, keeping selected parts of speech.

    Only tokens whose part of speech is listed in ``TARGET_CATEGORIES`` are
    returned, and each is reported in its root (dictionary) form when MeCab
    provides one.
    """

    # Index of the part-of-speech field in MeCab's comma-separated feature string.
    INDEX_CATEGORY=0
    # Index of the root-form field in the same feature string (IPADIC layout).
    INDEX_ROOT_FORM=6
    # Kept parts of speech: noun, verb, adjective, adverb, adnominal, interjection.
    TARGET_CATEGORIES=["名詞", "動詞",  "形容詞", "副詞", "連体詞", "感動詞"]
    
    def __init__(self,dictionary="-Ochasen -d /usr/local/lib/mecab/dic/mecab-ipadic-neologd/"):
        """Create the MeCab tagger.

        Args:
            dictionary: Option string handed verbatim to ``MeCab.Tagger``
                (output format and dictionary directory).
        """
        self.dictionary=dictionary
        self.tagger=MeCab.Tagger(self.dictionary)
    
    def extract_words(self,text):
        """Return the content words of ``text`` as a list of strings.

        Args:
            text: The text to tokenise. ``None``, an empty string and float
                NaN (a missing cell in pandas) are treated as "no text".

        Returns:
            A list with one entry per kept token, in document order: the root
            form when the feature string carries one, otherwise the surface form.
        """
        # The original guard referenced an undefined name (`textone`), so empty
        # input raised NameError instead of yielding an empty word list.
        if not text:
            return []
        # NaN is truthy, so it needs its own check; NaN is the only float that
        # is not equal to itself.
        if isinstance(text,float) and text!=text:
            return []
        
        words=[]
        
        node=self.tagger.parseToNode(text)
        while node:
            features=node.feature.split(',')
            
            if features[self.INDEX_CATEGORY] in self.TARGET_CATEGORIES:
                # Prefer the root form; fall back to the surface form when the
                # field is missing from the feature string or is the "*" placeholder.
                has_root_form=(
                    len(features)>self.INDEX_ROOT_FORM
                    and features[self.INDEX_ROOT_FORM]!="*"
                )
                if has_root_form:
                    words.append(features[self.INDEX_ROOT_FORM])
                else:
                    words.append(node.surface)
            node=node.next
        
        return words

class TextExtractor(BaseEstimator,TransformerMixin):
    """Stateless transformer that selects column 0 of a 2-D indexable input."""

    def fit(self,x,y=None):
        """Learn nothing; return this transformer unchanged."""
        return self

    def transform(self,rows):
        """Return ``rows[:, 0]``, i.e. the first column of every row."""
        first_column=rows[:,0]
        return first_column
    
class TextExtractorone(BaseEstimator,TransformerMixin):
    """Stateless transformer that selects column 1 of a 2-D indexable input."""

    def fit(self,x,y=None):
        """Learn nothing; return this transformer unchanged."""
        return self

    def transform(self,rows):
        """Return ``rows[:, 1]``, i.e. the second column of every row."""
        second_column=rows[:,1]
        return second_column

class TextExtractortwo(BaseEstimator,TransformerMixin):
    """Stateless transformer that selects column 2 of a 2-D indexable input."""

    def fit(self,x,y=None):
        """Learn nothing; return this transformer unchanged."""
        return self

    def transform(self,rows):
        """Return ``rows[:, 2]``, i.e. the third column of every row."""
        third_column=rows[:,2]
        return third_column
    
class OtherFeaturesExtractor(BaseEstimator,TransformerMixin):
    """Stateless transformer that returns every column after the first as floats."""

    def fit(self,x,y=None):
        """Learn nothing; return this transformer unchanged."""
        return self

    def transform(self,rows):
        """Return ``rows[:, 1:]`` converted with ``astype('float')``."""
        trailing_columns=rows[:,1:]
        return trailing_columns.astype('float')


In [None]:
# Using multiple text columns: each text column gets its own token-count + TF-IDF
# branch, and a FeatureUnion concatenates the branches before classification.

train=pd.read_csv('./data/train.csv')
test=pd.read_csv('./data/test.csv')

# NOTE(review): the column names below are redacted placeholders kept from the
# original; substitute the real column for each use. If the two drops really
# name the same column, the second one raises KeyError.
train_data=train.drop("""特定の列カラム""",axis=1)
train_data=train_data.drop("""特定の列カラム""",axis=1)

train_label=train["""特定の列カラム"""]

# Fixed: the original literal opened with four quotes, which put a stray '"'
# at the front of the column name passed to drop().
test_data=test.drop("""特定の列カラム""",axis=1)

X=train_data.values
Y=train_label.values

wd = WordDividor()


def _text_branch(column_extractor):
    """Build one feature branch: select a column, count MeCab tokens, apply TF-IDF."""
    return Pipeline([
        ('content',column_extractor),
        ('count_vector',CountVectorizer(analyzer=wd.extract_words)),
        ('tfidf',TfidfTransformer())
    ])


clf = Pipeline([
        ('features',FeatureUnion([
            ('title',_text_branch(TextExtractor())),
            ('content',_text_branch(TextExtractorone())),
            ('day',_text_branch(TextExtractortwo())),
        ])),
        # Alternative classifiers left by the author:
        #('classifier',SGDClassifier(loss='hinge',random_state=42))
        #('classifier',RandomForestClassifier(n_estimators=100,random_state=42))
        ('classifier',LogisticRegression(class_weight = "balanced",random_state=42))
    ])

clf.fit(X,Y)

print("feature_union")

# The extractors index rows positionally, so the pipeline is fed the raw value array.
includeday=clf.predict(test_data.values)
print(includeday)

In [None]:
# Assemble the submission file: an 'id' column taken from the test frame placed
# side by side with the predictions, then written to CSV without the index.
# NOTE(review): the column names are redacted placeholders kept from the original.
submid = pd.DataFrame({'id': test["""特定の列カラム"""]})
predicted = pd.DataFrame(includeday, columns = ["""特定の列カラム"""])
submission = pd.concat([submid, predicted], axis=1)
submission.to_csv('answer.csv', index=False)