# 環境構築（Environment Setup）

In [None]:
# Install libraries (IPython shell magics; this cell is meant to run in a notebook)
# - mecab-python3: Python binding for the MeCab tokenizer
# - unidic: dictionary package; the dictionary data is fetched by the `download` step
# - fonts-ipafont-gothic: Japanese font package installed for the word clouds and
#   plot titles below, which reference a font file under /usr/share/fonts/truetype
!pip install mecab-python3
!pip install unidic
!python -m unidic download
!apt-get -yq install fonts-ipafont-gothic
import pandas as pd
import numpy as np
import unicodedata
import MeCab
from collections import Counter
import requests
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
from IPython.display import display, HTML
import unidic
import re
import os

# Mount Google Drive (Colab only)
from google.colab import drive
drive.mount('/content/drive')

# Change the working directory so the relative CSV paths used later resolve
# against this Drive folder
os.chdir('/content/drive/MyDrive/User')

# データプロセシング（Data Processing）

In [None]:
# Load the two CSV exports; both are read with the cp932 (Windows Japanese) codec
cs_hist_df = pd.read_csv(
  filepath_or_buffer = 'cs_hist.csv',
  encoding = 'cp932',
)
incident_mgmt_df = pd.read_csv(
  filepath_or_buffer = 'incident_mgmt.csv',
  encoding = 'cp932',
)

# Data-cleansing helper
def remove_words(text, word_to_remove):
  """Return `text` with every occurrence of `word_to_remove` deleted.

  Args:
    text: string to clean.
    word_to_remove: substring to strip out.

  Returns:
    The cleaned string; `text` unchanged when the substring does not occur.
  """
  cleaned = text.replace(word_to_remove, '')
  return cleaned

# Run the data cleansing: strip embedded line breaks from the columns used downstream.
# The needed columns are selected *before* cleaning so unused columns are not processed,
# and each column is cleaned with Series.map instead of DataFrame.applymap, which is
# deprecated since pandas 2.1.
word_to_remove = '\n'

def _strip_word(cell):
  # Non-string cells (numbers, NaN) pass through untouched; NaN is blanked by fillna below.
  return remove_words(cell, word_to_remove) if isinstance(cell, str) else cell

# NOTE(review): the inquiry-number column is spelled differently in the two files
# ('問合わせ番号' vs '問合せ番号') - confirm both match the actual CSV headers.
cs_hist_df_edit = cs_hist_df.loc[:, ['問合わせ番号', '件名']].apply(lambda col: col.map(_strip_word)).fillna('')
incident_mgmt_df_edit = incident_mgmt_df.loc[:, ['問合せ番号']].apply(lambda col: col.map(_strip_word)).fillna('')

# Keep only the customer-support rows whose inquiry number appears in the incident list
cs_incident_df = cs_hist_df_edit[cs_hist_df_edit['問合わせ番号'].isin(incident_mgmt_df_edit['問合せ番号'])].reset_index(drop=True)

# 形態素解析（Morphological Analysis）

In [None]:
# Morphological-analysis helper
# The tagger is created lazily and shared, so MeCab is not re-initialised for
# every row (morpheme() is called once per row by morpheme_df).
_MECAB_TAGGER = None

def morpheme(dataframe_input, column_input, index_input):
  """Tokenize one cell with MeCab and return the tokens as a DataFrame.

  Args:
    dataframe_input: DataFrame holding the text to analyse.
    column_input: label of the text column.
    index_input: index label of the row to analyse.

  Returns:
    DataFrame with one row per token. Column 0 is the surface form; the
    remaining columns are the comma-separated feature fields emitted by
    MeCab, with '*' placeholders replaced by None. The frame is empty when
    MeCab emits no tokens (e.g. for an empty string).
  """
  global _MECAB_TAGGER
  if _MECAB_TAGGER is None:
    _MECAB_TAGGER = MeCab.Tagger()

  text = dataframe_input[column_input][index_input]
  cell_parse = _MECAB_TAGGER.parse(text)

  # Split each output line into surface + feature columns
  data = []
  for line in cell_parse.splitlines():
    # Skip the EOS (End Of Sentence) marker and blank lines explicitly
    # instead of assuming the marker is always the last line.
    if not line or line == 'EOS':
      continue
    # partition() never raises: a line without a tab yields an empty feature
    # string, and only the first tab separates surface from features.
    surface, _, feature = line.partition('\t')
    feature = [None if f == '*' else f for f in feature.split(',')]
    data.append([surface, *feature])
  return pd.DataFrame(data)

In [None]:
# Morphological-analysis driver
def morpheme_df(mor_df_input):
  """Count the lemmas found in the '件名' column of `mor_df_input`.

  Every row is analysed with morpheme(). The previous implementation seeded
  the result with index label 1 and then looped over iloc[2:], so the first
  row was never analysed and frames with fewer than two rows raised KeyError.

  Args:
    mor_df_input: DataFrame with a '件名' text column.

  Returns:
    DataFrame with columns 'Word' and 'Count', sorted by 'Count' descending.
    Empty (but with both columns) when no word survives the filters.
  """
  COLUMNS = ['表層形', '品詞', '原形']

  # Analyse every row, then concatenate once (concat inside a loop is quadratic)
  token_frames = [morpheme(mor_df_input, '件名', index) for index in mor_df_input.index]
  token_frames = [frame for frame in token_frames if not frame.empty]
  if token_frames:
    words_df = pd.concat(token_frames, ignore_index = True)
  else:
    words_df = pd.DataFrame()

  # Keep surface form (col 0), part of speech (col 1) and lemma (col 8).
  # reindex() pads a missing column with NaN instead of failing when the
  # tokenizer returned fewer feature fields than expected.
  words_df_edit = words_df.reindex(columns = [0, 1, 8])
  words_df_edit.columns = COLUMNS

  # Keep nouns, adjectives, verbs and adverbs only
  is_content_word = words_df_edit['品詞'].isin(['名詞', '形容詞', '動詞', '副詞'])
  lemmas = words_df_edit.loc[is_content_word, '原形'].dropna()

  # Keep lemmas whose first character is NOT in the hiragana range あ-ゖ
  # (so kanji, katakana, digits and Latin-initial lemmas all pass), then strip
  # a trailing '-<latin letters>' part - presumably the English gloss the
  # dictionary appends to loanword lemmas; verify against the dictionary output.
  kana_re = re.compile("[^あ-ゖ]")
  eng_suffix_re = re.compile(r'-[a-zA-Z]+')
  word_cloud_list_edit = [eng_suffix_re.sub('', lemma) for lemma in lemmas if kana_re.match(lemma)]

  # Count duplicates; most_common() already returns (word, count) pairs in
  # descending count order and yields a well-formed empty frame for no input.
  word_counts = Counter(word_cloud_list_edit)
  word_counts_df = pd.DataFrame(word_counts.most_common(), columns = ['Word', 'Count'])

  return word_counts_df

In [None]:
# Run the morphological analysis:
#   - on the full customer-support history
#   - on the subset of rows that were registered as incidents
cs_word_cloud_df = morpheme_df(mor_df_input = cs_hist_df_edit)
incident_word_cloud_df = morpheme_df(mor_df_input = cs_incident_df)

In [None]:
# Incident-rate helper
def calc_rate(pop_df, samp_df, min_count = 10):
  """Compute, per word, the share of its occurrences that come from incidents.

  Args:
    pop_df: word counts for the whole population; two columns, 'Word' and
      a count column.
    samp_df: word counts for the incident sample, same layout as `pop_df`.
    min_count: minimum population count a word needs to be kept. Defaults
      to 10, the threshold that used to be hard-coded.

  Returns:
    DataFrame with columns 'Word', 'CS Count', 'Incident Count' and 'Rate'
    (percentage rounded to 2 decimals), restricted to words present in both
    inputs, sorted by 'Rate' then 'CS Count' descending, with a fresh index.
  """
  # Inner join: only words that occur in both inputs get a rate
  rate_df = pd.merge(pop_df, samp_df, on = 'Word', how = 'inner')
  # Positional rename: the merge yields the key followed by one count column per input
  rate_df.columns = ['Word', 'CS Count', 'Incident Count']
  rate_df['Rate'] = (rate_df['Incident Count'] / rate_df['CS Count'] * 100).round(2)

  # Drop rare words, whose rate is dominated by noise
  frequent_df = rate_df[rate_df['CS Count'] >= min_count]
  return frequent_df.sort_values(['Rate', 'CS Count'], ascending = False).reset_index(drop = True)

In [None]:
# Compute the incident rate per word (population = all inquiries, sample = incidents)
cs_incident_rate_df = calc_rate(
  pop_df = cs_word_cloud_df,
  samp_df = incident_word_cloud_df,
)

# ワードクラウド（Word Cloud）

In [None]:
# Word-cloud factory
def word_cloud(word_cloud_df, freq):
  """Build a word cloud weighted by one column of `word_cloud_df`.

  Args:
    word_cloud_df: DataFrame with a 'Word' column and a numeric column `freq`.
    freq: name of the column used as each word's weight.

  Returns:
    The WordCloud object after generating the cloud from the word weights
    (800x800, white background, at most 50 words).
  """
  # Map each word to its weight
  weights = {}
  for word, weight in zip(word_cloud_df['Word'], word_cloud_df[freq]):
    weights[word] = weight

  # Configure the renderer
  options = {
    'background_color': "white",
    'width': 800,
    'height': 800,
    'font_path': '/usr/share/fonts/truetype/fonts-japanese-gothic.ttf',
    'colormap': 'viridis',
    'max_words': 50,
  }
  renderer = WordCloud(**options)
  return renderer.generate_from_frequencies(weights)

In [None]:
# Build the three word clouds
word_cloud_cs = word_cloud(cs_word_cloud_df, 'Count')
word_cloud_incident = word_cloud(incident_word_cloud_df, 'Count')
word_cloud_rate = word_cloud(cs_incident_rate_df, 'Rate')

# Lay the clouds out side by side
fig, axs = plt.subplots(1, 3, figsize=(10, 5))
jp_font = fm.FontProperties(fname = '/usr/share/fonts/truetype/fonts-japanese-gothic.ttf')

# One (image, title) pair per subplot, left to right
panels = (
  (word_cloud_cs, '顧客対応履歴'),
  (word_cloud_incident, 'インシデント'),
  (word_cloud_rate, 'インシデント率'),
)
for ax, (cloud, title) in zip(axs, panels):
  ax.imshow(cloud, interpolation = 'bilinear')
  # Titles are Japanese, so they need the Japanese font explicitly
  ax.set_title(title, fontproperties = jp_font)
  ax.axis('off')

# Render the figure
plt.tight_layout()
plt.show()

# Show the tables side by side
def display_side_by_side(*display_dfs):
  """Render the first 50 rows of each DataFrame next to each other as HTML.

  Rows are numbered from 1 in the output. The numbering is applied to a copy
  of the displayed rows, so the DataFrames passed in keep their own index
  (the previous version overwrote `df.index` on the caller's objects).

  Args:
    *display_dfs: DataFrames to display, left to right.
  """
  tables = []
  for df in display_dfs:
    top_rows = df.head(50).copy()
    top_rows.index = np.arange(1, len(top_rows) + 1)
    tables.append(top_rows.to_html())
  html_str = ''.join(table + "\t" for table in tables)
  display(HTML('<div style="display: flex; justify-content: space-around;">' + html_str + '</div>'))

display_side_by_side(cs_word_cloud_df, incident_word_cloud_df, cs_incident_rate_df)