In [5]:
import pandas as pd
import csv
from pprint import pprint
import MeCab
import regex
from collections import Counter

In [6]:
# コンストラクタと形態素解析
class IndependentWord:
    def __init__(self, dictpath="/usr/local/mecab/lib/mecab/dic/mecab-ipadic-neologd/"):
        if dictpath:
            self.m = MeCab.Tagger("-d"+dictpath)
            
        else:
            self.m = MeCab.Tagger()   
#         self.kanaalpha = regex.compile(r'[\p{Script=Hiragana}\p{Script=Katakana}ーA-Za-z]+')
        self.kanaalpha = regex.compile(r'[\p{Script=Hiragana}ーA-Za-z]+')
        self.number = regex.compile("[0-9０-９]+")
    class Constant:
        BASIC = "basic_form" #原型
        SURFACE = "surface_form" #表層型
        POS = "pos" #品詞
        POS_DETAIL_1 = "pos_detail_1" #品詞詳細1
        POS_DETAIL_2 = "pos_detail_2" #品詞詳細2
        POS_DETAIL_3 = "pos_detail_3" #品詞詳細3
        PRONUNCIATION = "pronunciation" #発音
        READING = "reading" #読み
        CONJUGATED_TYPE = "conjugated_type" #活用
        CONJUGATED_FORM = "conjugated_form" #活用形

  #mecabの出力行をobjectに変換
  #mecabの出力フォーマットに応じて適宜修正する
    def mecabLineToDict(self, line):
        surface, tmp = line.split("\t")
        others = tmp.split(",")
        
        Const = self.Constant
        return {
            Const.SURFACE: surface,
            Const.POS: others[0],
            Const.POS_DETAIL_1: others[1],
            Const.POS_DETAIL_2: others[2],
            Const.POS_DETAIL_3: others[3],
            Const.CONJUGATED_TYPE: others[4],
            Const.CONJUGATED_FORM: others[5],
            Const.BASIC: others[6],
            Const.READING: others[7],
            Const.PRONUNCIATION: others[8]
        }
  #自立語かどうかの判定
    def isIndependentWord(self, token):
        pos = token[self.Constant.POS]
        pos_detail_1 = token[self.Constant.POS_DETAIL_1]
        if pos == "名詞" and pos_detail_1 in ['一般','固有名詞','サ変接続','形容動詞語幹','副詞可能']: #用途によっては「副詞可能」を足しても良いかもしれません
            return True
        elif pos == '形容詞' and pos_detail_1 == '自立':
            return True
        elif pos == "副詞" and pos_detail_1 == "一般":
            return True
        elif pos == "動詞" and pos_detail_1 == "自立":
            return True
        else:
            return False
  #カナやアルファベット１文字や数字出ないかの判定
    def isReliableWord(self, token):
        surface = token[self.Constant.SURFACE]
        if self.number.fullmatch(surface):
            return False
        elif self.kanaalpha.fullmatch(surface):
            return False
        else:
            return True

  #自立語の原型を抽出
    def extract(self,text):
        lines = self.m.parse(text).splitlines()[:-1]
        tokens = [self.mecabLineToDict(line) for line in lines]

        independent_words = []
        for token in tokens:
#             if self.isIndependentWord(token):
            if self.isIndependentWord(token) and self.isReliableWord(token):
                surface = token[self.Constant.SURFACE]
                basic = token[self.Constant.BASIC]
                if basic == "*":
                    independent_words.append(surface)
                else:
                    independent_words.append(basic)
                    
        return independent_words

# 一般市民_心豊かに暮らせるまちづくり

In [7]:
df = pd.read_csv("../一般市民/舞鶴市_一般市民_1.csv")
list = []
for i in df['一般市民']:
    s = i.lstrip('・').replace('(','').replace(')','').replace('\u3000','').replace('TNR','')
    list.append(s)

# 実行
res = []
if __name__ == "__main__":
    for i in list:
        idptwd = IndependentWord()
        result = idptwd.extract(i)
        res.append(result)
        
# ストップワード
stop = ['街', 'まち', '町', '住む', '人', 'ひと', '舞鶴', '思う']
words = []

for i in res:
    for t in i:
        if t not in stop:
            words.append (t)

#頻出単語を出力
mine = Counter(words)
for i in mine.most_common():
    print(i)

('自然', 26)
('明るい', 21)
('豊か', 18)
('元気', 17)
('歴史', 17)
('子ども', 16)
('楽しい', 15)
('優しい', 14)
('田舎', 12)
('暮らす', 12)
('良い', 12)
('暮らせる', 12)
('地域', 11)
('今', 11)
('増える', 11)
('文化', 11)
('子供達', 11)
('来る', 11)
('場所', 11)
('大切', 10)
('魅力', 10)
('心', 9)
('多い', 9)
('安心', 9)
('市', 8)
('思える', 8)
('美しい', 7)
('笑顔', 7)
('芸術', 7)
('海', 7)
('交流', 7)
('世代', 7)
('年齢', 7)
('過ごせる', 7)
('方々', 7)
('持つ', 7)
('市民', 6)
('楽しめる', 6)
('都会', 6)
('教育', 6)
('年寄り', 6)
('高齢者', 6)
('作る', 6)
('考える', 6)
('施設', 6)
('知る', 6)
('温かい', 5)
('仲良く', 5)
('生活', 5)
('幸せ', 5)
('北部', 5)
('子供', 5)
('残す', 5)
('充実', 5)
('自分', 5)
('感じる', 5)
('遊べる', 5)
('良さ', 5)
('仕事', 5)
('医療', 5)
('居心地', 4)
('暖かい', 4)
('面白い', 4)
('若い人', 4)
('思いやり', 4)
('舞鶴市民', 4)
('人達', 4)
('力', 4)
('舞鶴市', 4)
('気持ち', 4)
('生きる', 4)
('発展', 4)
('嬉しい', 4)
('進む', 4)
('山', 4)
('言葉', 4)
('市外', 4)
('地元', 4)
('年配', 4)
('様々', 4)
('持てる', 4)
('文化的', 3)
('元気な町', 3)
('住民', 3)
('溢れる', 3)
('静か', 3)
('吹奏楽', 3)
('大事', 3)
('人々', 3)
('京都', 3)
('楽しみ', 3)
('一人一人', 3)
('環境', 3)
('残る', 3)
('

In [5]:
#ワードクラウドの作成
from matplotlib import pyplot as plt
from wordcloud import WordCloud
 
# fpath = "/Library/Fonts/ヒラギノ角ゴシック W3.ttc"
w = WordCloud(background_color="white", width=600, height=400, min_font_size=15)
w.generate(words)
 
# wordcloud.to_file("./wordcloud.png")
plt.imshow(W)
plt.axis('off')
plt.show()

TypeError: expected string or bytes-like object

In [None]:
df = pd.read_csv("~/class_submit/舞鶴市/subject/people.csv")