In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import TfidfVectorizer

In [0]:
# Install MeCab and the build tools used by the dictionary installer below.
# aptitude is installed first because the next line uses it as the package manager.
# NOTE(review): this `apt install` has no `-y`; it may wait for confirmation in a
# non-interactive runtime — confirm.
!apt install aptitude
!aptitude install mecab libmecab-dev mecab-ipadic-utf8 git make curl xz-utils file -y

# Install the MeCab Python binding (required to drive MeCab from Python); version pinned to 0.7.
!pip install mecab-python3==0.7

# Install the NEologd dictionary: shallow-clone the repository and run its installer.
# `echo yes |` feeds "yes" to the installer's stdin — presumably to answer its
# confirmation prompt; verify against the installer's documentation.
!git clone --depth 1 https://github.com/neologd/mecab-ipadic-neologd.git
!echo yes | mecab-ipadic-neologd/bin/install-mecab-ipadic-neologd -n

# Switch the dictionary: rewrite the dictionary path in /etc/mecabrc from the Debian
# default to the NEologd directory, writing the result to /etc/mecabrc.new.
!sed -e "s!/var/lib/mecab/dic/debian!/usr/lib/x86_64-linux-gnu/mecab/dic/mecab-ipadic-neologd!g" /etc/mecabrc > /etc/mecabrc.new
# Keep a copy of the original config as /etc/mecabrc.org, then put the rewritten one in place.
!cp /etc/mecabrc /etc/mecabrc.org
!cp /etc/mecabrc.new /etc/mecabrc

import MeCab

## データ読み込み

In [0]:
# Read the tab-separated tweet file and discard every row that has a missing value.
data_tweet = pd.read_csv('tweets.tsv', sep="\t").dropna()

# The second column supplies the target values as an array.
Y = data_tweet.iloc[:, 1].values

# Preview the first rows of the cleaned frame.
print(data_tweet.head())

In [0]:
tagger = MeCab.Tagger()
# NOTE(review): the empty parse before any parseToNode call is presumably the usual
# mecab-python3 warm-up workaround — confirm before removing.
tagger.parse('')


def word_tokenaize(texts):
    """Split a string into words and return them as a list.

    Only nodes whose first feature field (part of speech) is noun ('名詞') or
    adjective ('形容詞') are kept. The token emitted for a kept node is feature
    field 6; with the IPADIC/NEologd dictionary installed by this notebook that
    field is presumably the base form — confirm if the dictionary changes.
    Nodes whose field 6 is '*' (no value) are skipped, as are nodes whose
    feature string has fewer than 7 fields (previously an IndexError).

    Args:
        texts: The text to tokenize (a single string passed to MeCab).

    Returns:
        list of str: the selected words in the order they appear in the text.
    """
    target_types = ('名詞', '形容詞')
    word_index = 6

    word_list = []
    node = tagger.parseToNode(texts)
    while node:
        # Split once per node instead of once per field access.
        features = node.feature.split(",")
        # Membership test replaces the bitwise `|` on two boolean comparisons.
        if features[0] in target_types and len(features) > word_index:
            word = features[word_index]
            if word != '*':
                word_list.append(word)
        node = node.next

    return word_list

In [0]:
# TF-IDF features built from the custom MeCab tokenizer rather than the default one.
vectorizer = TfidfVectorizer(tokenizer=word_tokenaize)

# The first column of the frame is the text fed to the vectorizer.
tweet_texts = data_tweet.iloc[:, 0]
tweet_matrix = vectorizer.fit_transform(tweet_texts)

# Convert the fitted matrix to a dense array and report its dimensions.
X = tweet_matrix.toarray()
print(X.shape)

## ロジスティック回帰を実践

In [0]:
# Hold out 20% of the samples for evaluation. The seed is fixed so that the split,
# the fitted coefficients and the reported accuracy are the same on every
# Restart & Run All (the unseeded call produced a different split each run).
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

# Fit a logistic regression classifier on the training portion.
logit_multi2 = LogisticRegression()
logit_multi2.fit(X_train, y_train)

# Show the learned coefficients and intercept(s).
print(logit_multi2.coef_)
print(logit_multi2.intercept_)

In [0]:
# Predict on the held-out samples and print the fraction classified correctly.
y_pred = logit_multi2.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print(test_accuracy)