In [0]:
import json
import os
import re

import requests
from requests_oauthlib import OAuth1
from google.colab import files


# Twitter API credentials.
# Read from environment variables so real secrets never need to be saved in
# the notebook; the 'xxxxxxxx' placeholder is used only when a variable is unset.
access_token = os.environ.get('TWITTER_ACCESS_TOKEN', 'xxxxxxxx')
access_token_secret = os.environ.get('TWITTER_ACCESS_TOKEN_SECRET', 'xxxxxxxx')
consumer_key = os.environ.get('TWITTER_CONSUMER_KEY', 'xxxxxxxx')
consumer_key_secret = os.environ.get('TWITTER_CONSUMER_KEY_SECRET', 'xxxxxxxx')

# Streaming "sample" endpoint, filtered to Japanese-language tweets.
url = "https://stream.twitter.com/1.1/statuses/sample.json?language=ja"

# OAuth1 signer, passed as `auth=` when issuing the GET request.
twitter = OAuth1(consumer_key, consumer_key_secret, access_token, access_token_secret)

In [0]:
def normalize_text(text):
    """Strip URLs, boilerplate words and non-Japanese symbols from a tweet.

    The removals are applied one after another, in this order:

    1. ``http://`` / ``https://`` URLs (including a trailing ``…``),
    2. the literal strings ``RT``, ``お気に入り`` and ``まとめ``,
    3. every printable ASCII character from ``!`` to ``~`` (plain spaces
       are kept),
    4. every character in the range ``︰``–``＠``, which covers full-width
       punctuation and full-width digits,
    5. ideographic spaces, tabs, newlines and carriage returns.

    Carriage returns are removed together with newlines so the result is
    always a single physical line, whichever line-ending convention the
    reader of the output file uses.

    Args:
        text: The raw tweet text (``str``).

    Returns:
        The cleaned text with leading and trailing whitespace stripped.
    """
    removal_patterns = (
        r'https?://[\w/:%#\$&\?\(\)~\.=\+\-…]+',
        'RT',
        'お気に入り',
        'まとめ',
        r'[!-~]',
        r'[︰-＠]',
        '[\u3000\t\n\r]',
    )
    # Order matters: an earlier removal can join fragments that a later
    # pattern then matches, so the patterns are applied sequentially.
    for pattern in removal_patterns:
        text = re.sub(pattern, "", text)

    return text.strip()

In [0]:
# Streaming "sample" endpoint, filtered to Japanese-language tweets.
url = "https://stream.twitter.com/1.1/statuses/sample.json?language=ja"

# Append one normalized tweet per line to the output file.
# NOTE(review): with stream=True the loop presumably keeps running until the
# server closes the connection or the kernel is interrupted — confirm.
with open('public_text_twitter.tsv', 'a', encoding='utf-8') as f:
    # Using the response as a context manager releases the connection on exit.
    with requests.get(url, auth=twitter, stream=True) as res:
        # Surface authentication / endpoint failures instead of silently
        # collecting nothing.
        res.raise_for_status()
        for r in res.iter_lines():
            if not r:
                # Blank lines carry no payload.
                continue
            try:
                r_json = json.loads(r)
                text = r_json['text']
                line = normalize_text(text)
            except (ValueError, KeyError, TypeError):
                # Not valid JSON, no usable 'text' field, or a non-string
                # value: skip this message. Anything else (including
                # KeyboardInterrupt and write errors) now propagates.
                continue
            f.write(line + '\n')

In [0]:
# Send the collected tweet file from the Colab runtime to the browser
# (google.colab `files.download`).
files.download('public_text_twitter.tsv')

## word2vec 実践

In [0]:
# mecabインストール
!apt install aptitude
!aptitude install mecab libmecab-dev mecab-ipadic-utf8 git make curl xz-utils file -y

# mecab pythonインストール（pythonでmecabを動かすために必要)
!pip install mecab-python3==0.7

# neologd辞書インストール
!git clone --depth 1 https://github.com/neologd/mecab-ipadic-neologd.git
!echo yes | mecab-ipadic-neologd/bin/install-mecab-ipadic-neologd -n

In [0]:
# 辞書変更
!sed -e "s!/var/lib/mecab/dic/debian!/usr/lib/x86_64-linux-gnu/mecab/dic/mecab-ipadic-neologd!g" /etc/mecabrc  > /etc/mecabrc.new
!cp /etc/mecabrc /etc/mecabrc.org
!cp /etc/mecabrc.new /etc/mecabrc

In [0]:
import MeCab
import pandas as pd
import unicodedata
from gensim.models import word2vec

In [0]:
mt = MeCab.Tagger("-Ochasen")
# Workaround for older mecab-python3 releases, where parseToNode() can return
# nodes with corrupted surfaces unless parse() has been called once first.
mt.parse('')

# Load the collected tweets: one normalized tweet per line, read into a
# single 'text' column.
df = pd.read_csv('public_text_twitter.tsv', sep='\t', names=['text'])
# De-duplicate, dropping missing values so they are not later tokenized as
# the string 'nan'.
text_lists = df['text'].dropna().unique().tolist()

In [0]:
# Feature-string prefixes of the tokens to keep (名詞 = noun, 形容詞 = adjective).
word_pos = ('名詞', '形容詞')


def _iter_nodes(node):
    """Yield every node of the MeCab linked list that starts at ``node``."""
    while node:
        yield node
        node = node.next


with open('public_text_splited.txt', 'w', encoding='utf-8') as f:
    for text in text_lists:
        # NFKC-normalize before tokenizing; str() guards against non-string values.
        text = unicodedata.normalize('NFKC', str(text))

        kept_surfaces = [
            node.surface
            for node in _iter_nodes(mt.parseToNode(text))
            # Keep wanted parts of speech, except tokens tagged 非自立.
            if node.feature.startswith(word_pos) and ',非自立,' not in node.feature
        ]

        # One input text per output line, tokens separated by single spaces.
        print(' '.join(kept_surfaces), file=f)

In [0]:
import gensim

# Stream the space-separated token file one line (= one sentence) at a time.
sentences = word2vec.LineSentence('public_text_splited.txt')

# The embedding-dimension keyword was renamed from `size` to `vector_size`
# in gensim 4.0; use whichever one the installed version accepts.
dim_kwarg = 'vector_size' if int(gensim.__version__.split('.')[0]) >= 4 else 'size'

model = word2vec.Word2Vec(sentences,
                          sg=1,         # 0: CBOW, 1: skip-gram
                          window=3,     # maximum distance from the input word
                          min_count=5,  # filter words by occurrence count
                          **{dim_kwarg: 200})  # dimensionality of the word vectors

In [0]:
# Query through `model.wv`: the model-level `most_similar` shortcut is
# deprecated in gensim 3.x and removed in 4.x, while `wv.most_similar`
# works in both.
model.wv.most_similar(positive=['人生'], topn=20)