In [18]:
import numpy as np
import json
import requests
from requests_oauthlib import OAuth1Session
import re
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# 取得したkey情報
import configure

twitter = OAuth1Session(configure.consumer_key, configure.consumer_key_secret, configure.access_token, configure.access_token_secret)

In [19]:
# タイムライン取得用のURL
url = "https://api.twitter.com/1.1/statuses/user_timeline.json"

# URLや特殊文字など削除
def normalize_text(text):
    text = re.sub(r'https?://[\w/:%#\$&\?\(\)~\.=\+\-…]+', "", text)
    text = re.sub('RT', "", text)
    text = re.sub('お気に入り', "", text)
    text = re.sub('まとめ', "", text)
    text = re.sub(r'[!-~]', "", text)
    text = re.sub(r'[︰-＠]', "", text)
    text = re.sub('\u3000',"", text)
    text = re.sub('\t', "", text)
    text = re.sub('\n', "", text)
    text = text.strip()
    return text

# パラメータの定義
params = {
    'screen_name': 'Np_Ur_',
    'exclude_replies': True,
    'include_rts': False,
    'count': 200
}

f_out = open('np_ur_.tsv', 'w')

for _ in range(10):
    res = twitter.get(url, params = params)
    
    if res.status_code == 200:
        
        timeline = json.loads(res.text)
        if len(timeline) == 0:
            break
        
        # 各ツイートの本文を表示
        for i in range(len(timeline)):
            f_out.write(normalize_text(timeline[i]['text']) + '\t' + "0" + '\n')
            
        # 一番最後のツイートIDをパラメータmax_idに追加
        params['max_id'] = timeline[len(timeline) - 1]['id'] - 1

f_out.close()

In [20]:
# パラメータの定義
params = {
    'screen_name': 'lucky_CandR',
    'exclude_replies': True,
    'include_rts': False,
    'count': 200
}

f_out = open('lucky_CandR.tsv', 'w')

for _ in range(10):
    res = twitter.get(url, params = params)
    
    if res.status_code == 200:
        
        timeline = json.loads(res.text)
        if len(timeline) == 0:
            break
        
        # 各ツイートの本文を表示
        for i in range(len(timeline)):
            f_out.write(normalize_text(timeline[i]['text']) + '\t' + "1" + '\n')
            
        # 一番最後のツイートIDをパラメータmax_idに追加
        params['max_id'] = timeline[len(timeline) - 1]['id'] - 1

f_out.close()

In [21]:
# データ統合
import pandas as pd

tsv_files = ['np_ur_.tsv', 'lucky_CandR.tsv']
list = []

for file in tsv_files:
    list.append(pd.read_csv(file, delimiter='\t', header=None))
df = pd.concat(list, sort=False)

df.to_csv('tweets.tsv', sep='\t', index=False)

In [22]:
import MeCab

tagger = MeCab.Tagger()
# 初期化
tagger.parse('')

node = tagger.parseToNode("AKB48よりも乃木坂のほうが好き")
while node:
    print(node.surface, node.feature)
    node = node.next

 BOS/EOS,*,*,*,*,*,*,*,*
AKB 名詞,一般,*,*,*,*,*
48 名詞,数,*,*,*,*,*
より 助詞,格助詞,一般,*,*,*,より,ヨリ,ヨリ
も 助詞,係助詞,*,*,*,*,も,モ,モ
乃木坂 名詞,固有名詞,一般,*,*,*,乃木坂,ノギザカ,ノギザカ
の 助詞,連体化,*,*,*,*,の,ノ,ノ
ほう 名詞,非自立,一般,*,*,*,ほう,ホウ,ホー
が 助詞,格助詞,一般,*,*,*,が,ガ,ガ
好き 名詞,形容動詞語幹,*,*,*,*,好き,スキ,スキ
 BOS/EOS,*,*,*,*,*,*,*,*


In [23]:
data_tweet = pd.read_csv('tweets.tsv', sep="\t")
data_tweet = data_tweet.dropna()
y = data_tweet.iloc[:, 1].values

In [24]:
tagger = MeCab.Tagger()
tagger.parse('')

# 文字列を単語で分割しリストに格納する
def word_tokenaize(texts):
    node = tagger.parseToNode(texts)
    word_list = []
    while node:
        word_type = node.feature.split(",")[0]
        if (word_type == '名詞')|(word_type == '形容詞'):
            word = node.feature.split(",")[6]
            if word != "*":
                word_list.append(word)
        node = node.next
    
    return word_list

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold

vectorizer = TfidfVectorizer(tokenizer=word_tokenaize)

tweet_matrix = vectorizer.fit_transform(data_tweet.iloc[:, 0])
X = tweet_matrix.toarray()
print(X.shape)

(840, 1812)


In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

logit_multi2 = LogisticRegression()
logit_multi2.fit(X_train, y_train)

print(logit_multi2.coef_)
print(logit_multi2.intercept_)

[[-0.04031432 -0.0486348   0.41482394 ...  0.1194566  -0.03842173
   0.        ]]
[-1.65414262]




In [28]:
y_pred = logit_multi2.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.9345238095238095
