# 環境構築（Environment Setup）

In [None]:
# ライブラリ導入
!pip install mecab-python3
!pip install unidic
!python -m unidic download
!pip install pyvis==0.2.1
import pandas as pd
import numpy as np
import unicodedata
import MeCab
from collections import Counter
import requests
import matplotlib.pyplot as plt
import unidic
import re
import os
import itertools
from pyvis.network import Network
from IPython.display import display, HTML

# Mount Google Drive (Colab only) so that its files are reachable under /content/drive
from google.colab import drive
drive.mount('/content/drive')

# Change the working directory: the relative file names used later
# ('cs_hist.csv', 'incident_mgmt.csv', 'co_occurrence_network.html') resolve against this folder
os.chdir('/content/drive/MyDrive/User')

# データプロセシング（Data Processing）

In [None]:
# Load the source CSV files from the working directory.
# Both files are decoded as cp932 (Windows Shift_JIS).
cs_hist_df = pd.read_csv('cs_hist.csv', encoding='cp932')            # customer-support history
incident_mgmt_df = pd.read_csv('incident_mgmt.csv', encoding='cp932')  # incident management list

# データクレンジング関数の宣言
def replace_words(text, word_to_remove, word_to_replace):
    """Return ``text`` with every occurrence of ``word_to_remove`` replaced.

    Args:
        text: The string to clean.
        word_to_remove: The substring to look for.
        word_to_replace: The substring inserted in place of each occurrence.

    Returns:
        A new string; ``text`` itself is not modified.
    """
    cleaned = text.replace(word_to_remove, word_to_replace)
    return cleaned

# Run the data cleansing: strip line breaks from every string cell
word_to_remove = '\n'
word_to_replace = ''


def _cleanse_columns(df, columns):
    """Return ``df`` restricted to ``columns`` with string cells cleansed.

    Every string cell has ``word_to_remove`` replaced by ``word_to_replace``
    (via ``replace_words``); non-string cells are passed through unchanged and
    missing values are then filled with ``''``.

    Args:
        df: Source DataFrame.
        columns: List of column labels to keep.

    Returns:
        A new DataFrame holding only ``columns``.
    """
    def cleanse_cell(value):
        # Only strings can contain the unwanted text; numbers/NaN stay as they are.
        if isinstance(value, str):
            return replace_words(value, word_to_remove, word_to_replace)
        return value

    # Select the columns first so that only the cells that are actually kept
    # get scanned, and map column by column with Series.map because
    # DataFrame.applymap is deprecated since pandas 2.1.
    selected = df.loc[:, columns]
    return selected.apply(lambda column: column.map(cleanse_cell)).fillna('')


cs_hist_df_edit = _cleanse_columns(cs_hist_df, ['問合わせ番号', '件名'])
incident_mgmt_df_edit = _cleanse_columns(incident_mgmt_df, ['問合せ番号'])

# Keep only the customer-support rows whose inquiry number also appears in the
# incident list, then renumber the rows from 0.
# NOTE(review): missing values were replaced by '' in both frames above, so a
# blank inquiry number matches a blank entry in the incident list — confirm
# that this is intended.
cs_incident_df = (
    cs_hist_df_edit
    .loc[cs_hist_df_edit['問合わせ番号'].isin(incident_mgmt_df_edit['問合せ番号'])]
    .reset_index(drop=True)
)

# 形態素解析（Morphological Analysis）

In [None]:
# 形態素解析関数の宣言
# Parts of speech (first feature field) that are kept as content words.
_CONTENT_POS = frozenset(['名詞', '形容詞', '動詞', '副詞'])

# Index of the base form within the comma-separated feature string. The
# original implementation read DataFrame column 8 (surface + 8 features) and
# labelled it '原形'; this presumably matches the UniDic lemma field — confirm
# against the dictionary actually loaded by MeCab.
_LEMMA_FIELD = 7

# Matches when the first character is NOT hiragana (あ–ゖ).
_NON_HIRAGANA_HEAD = re.compile("[^あ-ゖ]")

# A hyphen followed by ASCII letters, e.g. 'サーバー-server' -> 'サーバー'
# (presumably the source-word gloss the dictionary appends to loanword lemmas).
_LEMMA_SUFFIX = re.compile(r'-[a-zA-Z]+')

_shared_tagger = None


def _get_shared_tagger():
    """Return one lazily created MeCab tagger shared by all calls."""
    global _shared_tagger
    if _shared_tagger is None:
        _shared_tagger = MeCab.Tagger()
    return _shared_tagger


def morpheme_tokenizer(text_input, tagger=None):
    """Split ``text_input`` into the base forms of its content words.

    The text is parsed with MeCab; each output line is expected to look like
    ``surface<TAB>feature,feature,...``. A token is kept when

    * its part of speech (first feature) is noun, adjective, verb or adverb,
    * it has a base form (feature index 7 exists and is not ``'*'``), and
    * that base form does not start with a hiragana character.

    Note that the last rule keeps katakana, alphabetic and numeric words as
    well as kanji; it only removes words beginning with hiragana.

    Args:
        text_input: The string to analyse.
        tagger: Optional object with a ``parse(str) -> str`` method. When
            omitted, a single ``MeCab.Tagger()`` is created on first use and
            reused afterwards instead of loading the dictionary on every call.

    Returns:
        A list of base-form strings in order of appearance (possibly empty).
        Tokens whose feature string is too short to contain a base form are
        skipped instead of raising.
    """
    if tagger is None:
        tagger = _get_shared_tagger()

    lemmas = []
    for line in tagger.parse(text_input).splitlines():
        _surface, tab, feature_csv = line.partition('\t')
        if not tab:
            # Lines without a tab carry no token (the trailing 'EOS' marker).
            continue

        features = feature_csv.split(',')
        if features[0] not in _CONTENT_POS:
            continue
        if len(features) <= _LEMMA_FIELD:
            # Short feature strings have no base-form field.
            continue

        lemma = features[_LEMMA_FIELD]
        if lemma == '*':
            # '*' is MeCab's placeholder for "no value".
            continue
        if not _NON_HIRAGANA_HEAD.match(lemma):
            continue

        lemmas.append(_LEMMA_SUFFIX.sub('', lemma))
    return lemmas

# 共起ネットワーク（Co-occurrence Network）

In [None]:
# Count collocations (word pairs that occur in the same subject line)

# Tokenize every subject into its list of content-word base forms.
cells = [morpheme_tokenizer(cell) for cell in cs_incident_df['件名']]

# All 2-word combinations inside each subject.
# NOTE: combinations are positional, so a word that occurs twice in one
# subject yields repeated pairs, including a pair of the word with itself.
cells_combs = [list(itertools.combinations(cell, 2)) for cell in cells]

# Sort each pair so that (a, b) and (b, a) are counted as the same pair.
words_combs = [[tuple(sorted(words)) for words in cell] for cell in cells_combs]

# Flatten the per-subject lists into one list of pairs and count them.
target_combs = [pair for cell_pairs in words_combs for pair in cell_pairs]
combs_count = Counter(target_combs)

# One row per pair, most frequent first. The columns are given explicitly so
# that the frame still has '前' / '後' / 'count' when no pair was found.
combs_df = pd.DataFrame(
    [
        {"前": pair[0], "後": pair[1], "count": freq}
        for pair, freq in combs_count.most_common()
    ],
    columns=["前", "後", "count"],
)

In [None]:
# 共起ネットワーク関数の宣言
def co_occ_network(df, max_edges=150):
    """Build a pyvis co-occurrence network from a table of word pairs.

    Args:
        df: DataFrame with the columns '前' and '後' (the two words of a pair)
            and 'count' (used as the edge weight).
        max_edges: Number of leading rows of ``df`` turned into edges. The
            function does not sort, so pass the rows in the desired priority
            order. ``None`` uses every row. Defaults to 150.

    Returns:
        The populated ``Network``. Each node's tooltip lists its neighbours
        and its ``value`` is set to its number of neighbours.
    """
    got_net = Network(
        height='500px',
        width='95%',
        bgcolor="white",
        font_color="black",
        notebook=True,
    )
    got_net.force_atlas_2based()

    got_data = df.iloc[:max_edges]

    # An empty frame may not even have the expected columns; skip it entirely.
    if not got_data.empty:
        for src, dst, w in zip(got_data['前'], got_data['後'], got_data['count']):
            # Both endpoints are registered for every edge, so the same word
            # is passed repeatedly; this relies on pyvis ignoring ids it
            # already knows — confirm against the pyvis documentation.
            got_net.add_node(src, src, title=src)
            got_net.add_node(dst, dst, title=dst)
            got_net.add_edge(src, dst, value=w)

    neighbor_map = got_net.get_adj_list()

    for node in got_net.nodes:
        # Sorted so that the tooltip text is the same on every run.
        neighbors = sorted(neighbor_map[node["id"]])
        node["title"] += " Neighbors:<br>" + "<br>".join(neighbors)
        node["value"] = len(neighbors)

    got_net.show_buttons(filter_=['physics'])
    return got_net

In [None]:
# Render the co-occurrence network
got_net = co_occ_network(combs_df)
# Write the interactive graph to an HTML file in the working directory.
got_net.show('co_occurrence_network.html')

# Embed the generated file in the notebook output. The path is passed as
# filename= because the first positional parameter of HTML() is raw markup.
display(HTML(filename='co_occurrence_network.html'))