In [None]:
import pandas as pd
import numpy as np
from glob import glob
import sys

sys.path.append('../../src')
from ecomplexity import ecomplexity
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
from PIL import Image
import io

import matplotlib.ticker as ptick
from matplotlib.ticker import MultipleLocator, FixedFormatter, FixedLocator


import networkx as nx
import networkx.algorithms.bipartite as bip

# Global matplotlib defaults applied to every figure created below.
plt.rcParams['font.size'] = 18
plt.rcParams['font.family'] = 'Meiryo'  # several titles/labels below contain Japanese text
plt.rcParams['axes.axisbelow'] = True  # keep grid lines behind the plotted artists

# Render floats in pandas output with 3 digits after the decimal point.
pd.options.display.float_format = '{:.3f}'.format


In [None]:
import initial_condition
from process import weight
from visualize import rank as vr


In [None]:
# Directory that holds the pre-computed technology-comparison tables.
data_dir = "../../data/processed/internal/tech_comparison/"



In [None]:
# glob() returns matches in arbitrary, OS-dependent order, while the loading
# cell picks files purely by position (index 0 and index 1). Sorting makes
# that positional choice reproducible across machines and file systems.
# NOTE(review): confirm that, in sorted order, the first file is the
# corporate (name) table and the second is the regional (addr) table.
path_list = sorted(glob(data_dir+'*'))
path_list


In [None]:
# Read the two comparison tables; both files share the same CSV dialect.
csv_options = dict(encoding='utf-8', sep=',')
name_df = pd.read_csv(path_list[0], **csv_options)
addr_df = pd.read_csv(path_list[1], **csv_options)

# Preview the first rows of the first table.
display(name_df.head())


In [None]:
# One row per IPC class whose 5-sector label mentions "pharmaceuticals".
is_pharma = name_df['schmoch5'].str.contains('pharmaceuticals')
name_df.loc[is_pharma].drop_duplicates(subset='ipc3', ignore_index=True)

In [None]:
# Destination directory for the figures saved by this notebook.
output_dir = "../../output/figures/tech_comparison/"


In [None]:
# List the IPC classes that belong to the "Basic materials chemistry" field.
basic_chem_ipc3 = name_df.loc[name_df['schmoch35'] == "Basic materials chemistry", 'ipc3']
print(*basic_chem_ipc3.values, sep=', ')

In [None]:
# Number of distinct (non-missing) IPC classes in the corporate table.
len(name_df['ipc3'].dropna().unique())

In [None]:
# Compare TCI computed on Schmoch fields with TCI computed on IPC classes.
# Upper panel: name_df (corporate level); lower panel: addr_df (regional
# level). Both panels share one categorical x axis of Schmoch field names.
fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(12, 8), sharex=True)

# legend() ignores `fontsize=` whenever `prop=` is supplied, so the size has
# to be part of the font-properties dict to take effect.
legend_font = {'weight': 'bold', 'size': 15}

# --- Upper panel: corporate level -------------------------------------------
ax1.scatter(name_df['schmoch35'], name_df['schmoch35_tci'], color='tab:blue', label='Schmoch（N=35）')
ax1.scatter(name_df['schmoch35'], name_df['ipc3_tci'], color='red', alpha=0.6, label='IPC Class（N=124）')
ax1.set_ylabel('TCI in Corporations', fontsize=24, fontweight='bold')
# Integer tick labels via a formatter (not set_yticklabels on automatic
# ticks), so each label is always derived from its own tick position.
ax1.yaxis.set_major_formatter(ptick.FuncFormatter(lambda value, _pos: f'{int(value)}'))
ax1.tick_params(axis='y', labelrotation=90)
ax1.grid(True, linestyle='--', which='major', axis='x')
ax1.legend(loc='upper left', prop=legend_font, bbox_to_anchor=(-0.55, 0.5), borderaxespad=0)

# --- Lower panel: regional level --------------------------------------------
ax2.scatter(addr_df['schmoch35'], addr_df['schmoch35_tci'], color='tab:green', label='Schmoch（N=35）')
ax2.scatter(addr_df['schmoch35'], addr_df['ipc3_tci'], color='tab:orange', alpha=0.6, label='IPC Class（N=124）')
ax2.set_ylabel('TCI in Regions', fontsize=24, fontweight='bold')
ax2.set_yticks(list(range(0, 100+1, 25)))
ax2.tick_params(axis='y', labelrotation=90)
ax2.grid(True, linestyle='--', which='major', axis='x')
ax2.legend(loc='upper left', prop=legend_font, bbox_to_anchor=(-0.55, 0.5), borderaxespad=0)

# Only rotate the category labels that the categorical axis already provides;
# overwriting them with a separately built list could mislabel categories.
ax2.tick_params(axis='x', labelrotation=90)
ax2.set_xlabel('Schmoch', fontsize=24, fontweight='bold', rotation=180)

fig.savefig(output_dir+'schmoch35_ipc3.png', dpi=400, bbox_inches='tight')
plt.show()


In [None]:
import numpy as np
from scipy.stats import wilcoxon

# Per table: absolute gap between the Schmoch-level and IPC-level TCI, plus a
# combined "<schmoch35>-<ipc3>" key used to pair rows across the two tables.
for tci_frame in (name_df, addr_df):
    tci_frame['tci_abs'] = (tci_frame['schmoch35_tci'] - tci_frame['ipc3_tci']).abs()
    tci_frame['schmoch35-ipc3'] = tci_frame['schmoch35'] + '-' + tci_frame['ipc3']

# Pair the corporate and regional gaps on the combined key.
name_gaps = name_df[['schmoch35-ipc3', 'tci_abs']].rename(columns={'tci_abs':'tci_abs_name'})
addr_gaps = addr_df[['schmoch35-ipc3', 'tci_abs']].rename(columns={'tci_abs':'tci_abs_addr'})
name_addr_df = pd.merge(name_gaps, addr_gaps, on='schmoch35-ipc3', how='inner')

# Wilcoxon signed-rank test on the paired gaps.
statistic, p_value = wilcoxon(name_addr_df['tci_abs_name'], name_addr_df['tci_abs_addr'])
print(statistic, p_value)

In [None]:
# Row count of the corporate table.
name_df.shape[0]

In [None]:
# Mean and standard deviation of the absolute TCI gap per level. A bare
# expression that is not the last line of a cell is never rendered, so the
# statistics are shown explicitly with display().
display(pd.DataFrame({
    'Corporate': name_df['tci_abs'].agg(['mean', 'std']),
    'Regional': addr_df['tci_abs'].agg(['mean', 'std']),
}))

# Overlaid histograms of the paired gaps; bin count is log2(n) + 1.
n_bins = int(np.log2(len(name_addr_df))+1)
fig, ax = plt.subplots()
name_addr_df['tci_abs_name'].plot(kind='hist', bins=n_bins, alpha=1, label='Corporate', ax=ax)
name_addr_df['tci_abs_addr'].plot(kind='hist', bins=n_bins, alpha=0.8, label='Regional', ax=ax)
ax.set_xlabel('Absolute difference of TCI\n(IPC Class - Schmoch)', fontsize=24, fontweight='bold')
ax.set_ylabel('Frequency', fontsize=24)
# legend() ignores `fontsize=` when `prop=` is given, so the size goes in prop.
ax.legend(prop={'weight': 'bold', 'size': 20})
plt.show()

In [None]:
# One row per Schmoch field, ordered by its TCI (ascending).
bubble_df = (
    name_df.drop_duplicates('schmoch35', ignore_index=True)
    [['schmoch35', 'schmoch5', 'schmoch35_tci']]
    .sort_values('schmoch35_tci', ascending=True)
)
plt.figure(figsize=(2, 20))
# The x vector is sized from the data so the plot does not break when the
# number of distinct fields differs from 35.
plt.scatter([1]*len(bubble_df), bubble_df['schmoch35'], alpha=0.5)

In [None]:
# Mean and standard deviation of TCI over unique categories: first at the
# Schmoch-field level, then (after a separator line) at the IPC-class level.
# Within each level the corporate table is reported before the regional one.
for key_col, tci_col in (('schmoch35', 'schmoch35_tci'), ('ipc3', 'ipc3_tci')):
    for tci_frame in (name_df, addr_df):
        unique_tci = tci_frame.drop_duplicates(subset=key_col)[tci_col]
        print(unique_tci.mean())
        print(unique_tci.std())
    if key_col == 'schmoch35':
        print('***************************')

In [None]:
# One figure per level, each overlaying the TCI distribution over unique
# Schmoch fields with the one over unique IPC classes.
hist_series = (
    ('schmoch35', 'schmoch35_tci', 'Schmoch35'),
    ('ipc3', 'ipc3_tci', 'IPC(N=124)'),
)
for panel_title, tci_frame in (('法人-技術で求めたTCI', name_df), ('都市-技術で求めたTCI', addr_df)):
    plt.title(panel_title)
    for key_col, tci_col, legend_label in hist_series:
        unique_tci = tci_frame.drop_duplicates(subset=key_col)[tci_col]
        unique_tci.plot(kind='hist', bins=7, alpha=0.5, label=legend_label)
    plt.legend()
    plt.show()

In [None]:
# Strip plot of corporate-level TCI: Schmoch fields at x=0, IPC classes at x=1.
schmoch_tci = name_df.drop_duplicates(subset='schmoch35')['schmoch35_tci']
ipc3_tci = name_df.drop_duplicates(subset='ipc3')['ipc3_tci']

# The x vectors are sized from the data rather than hard-coded, so the cell
# keeps working if the number of unique categories changes.
plt.scatter([0]*len(schmoch_tci), schmoch_tci, alpha=0.5)
plt.scatter([1]*len(ipc3_tci), ipc3_tci, alpha=0.5)

In [None]:
# Row-wise gap between Schmoch-level and IPC-level TCI for each table
# (corporate first, then regional).
tci_gaps = [tci_frame['schmoch35_tci'] - tci_frame['ipc3_tci'] for tci_frame in (name_df, addr_df)]

# Sum of squared gaps.
for gap in tci_gaps:
    print(sum(gap ** 2))

# Sum of absolute gaps, computed as the square root of the squared gap.
for gap in tci_gaps:
    print(sum(np.sqrt(gap ** 2)))

In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt

# Regress the IPC-class TCI on the Schmoch-field TCI, separately for the
# corporate table (name_df) and the regional table (addr_df).

# Corporate-level regression.
X_corporate = sm.add_constant(name_df['schmoch35_tci'])  # explanatory variable plus intercept
y_corporate = name_df['ipc3_tci']  # response variable
model_corporate = sm.OLS(y_corporate, X_corporate).fit()

# Regional-level regression.
X_city = sm.add_constant(addr_df['schmoch35_tci'])
y_city = addr_df['ipc3_tci']
model_city = sm.OLS(y_city, X_city).fit()

# Print both model summaries.
print("法人レベルの回帰モデルの結果:")
print(model_corporate.summary())
print("\n都市レベルの回帰モデルの結果:")
print(model_city.summary())

# Residuals against the explanatory variable, one panel per level.
fig, (ax_corporate, ax_city) = plt.subplots(1, 2, figsize=(12, 6))

ax_corporate.scatter(name_df['schmoch35_tci'], model_corporate.resid)
ax_corporate.set_title('法人レベルの残差プロット')
ax_corporate.set_xlabel('Schmoch TCI')
ax_corporate.set_ylabel('Residuals')

ax_city.scatter(addr_df['schmoch35_tci'], model_city.resid, color='tab:green')
ax_city.set_title('都市レベルの残差プロット')
ax_city.set_xlabel('Schmoch TCI')
ax_city.set_ylabel('Residuals')

fig.tight_layout()
plt.show()


In [None]:

# Correlation between Schmoch-level and IPC-level TCI for each table.
name_corr = name_df['schmoch35_tci'].corr(name_df['ipc3_tci'])
addr_corr = addr_df['schmoch35_tci'].corr(addr_df['ipc3_tci'])
print(name_corr)
print(addr_corr)

# Side-by-side scatter plots, with the correlation shown in each title.
corr_panels = (
    (name_df, 'tab:blue', f'法人レベルの相関分析(corr={name_corr:.3f})'),
    (addr_df, 'tab:green', f'都市レベルの相関分析(corr={addr_corr:.3f})'),
)
plt.figure(figsize=(12, 6))
for position, (tci_frame, point_color, panel_title) in enumerate(corr_panels, start=1):
    plt.subplot(1, 2, position)
    plt.scatter(tci_frame['schmoch35_tci'], tci_frame['ipc3_tci'], color=point_color, label='Schmoch（N=35）')
    plt.title(panel_title)
    plt.xlabel('Schmoch TCI')
    plt.ylabel('IPC TCI')

plt.tight_layout()
plt.show()


In [None]:
# Bar colour for each of the five top-level sectors (schmoch5 values).
tech_color = {
    'Chemistry, pharmaceuticals': 'red',
    'Electrical engineering': 'blue',
    'Instruments': 'green',
    'Mechanical engineering, machinery': 'orange',
    'Other fields': 'gray',
}

fig, ax2 = plt.subplots(figsize=(6, 12), sharex=True)

# One row per Schmoch field, ordered by TCI (ascending).
sample = name_df.drop_duplicates(subset='schmoch35', ignore_index=True).sort_values('schmoch35_tci', ascending=True)

# Draw one barh layer per sector: fields outside the sector get a NaN width,
# so every layer uses the same y categories but only colours its own fields.
for tech, bar_color in tech_color.items():
    widths = sample['schmoch35_tci'].where(sample['schmoch5'] == tech).to_numpy()
    ax2.barh(sample['schmoch35'], widths, color=bar_color, label=tech)

ax2.set_xlabel('TCI', fontsize=24, fontweight='bold')


In [None]:
# List the IPC classes that belong to the "Basic materials chemistry" field.
basic_chem_ipc3 = name_df.loc[name_df['schmoch35'] == "Basic materials chemistry", 'ipc3']
print(*basic_chem_ipc3.values, sep=', ')

In [None]:
# List the IPC classes that belong to the "Basic materials chemistry" field.
basic_chem_ipc3 = name_df.loc[name_df['schmoch35'] == "Basic materials chemistry", 'ipc3']
print(*basic_chem_ipc3.values, sep=', ')

In [None]:
# Build, for each level, a table of unique (ipc3, ipc3_tci) rows sorted by
# TCI (descending) with a competition-style rank (ties share the minimum rank).
rank_tables = {}
for level, tci_frame in (('addr', addr_df), ('name', name_df)):
    rank_table = (
        tci_frame[['ipc3', 'ipc3_tci']]
        .sort_values('ipc3_tci', ascending=False)
        .drop_duplicates(ignore_index=True)
    )
    rank_table['rank'] = rank_table['ipc3_tci'].rank(ascending=False, method='min')
    rank_tables[level] = rank_table

addr_ipc3_rank_df = rank_tables['addr']
name_ipc3_rank_df = rank_tables['name']

# Join the regional and corporate ranks per IPC class; columns are suffixed
# with the level they come from.
addr_name_ipc3_rank_df = pd.merge(
    addr_ipc3_rank_df.rename(columns={'ipc3_tci': 'ipc3_tci_addr', 'rank': 'rank_addr'}),
    name_ipc3_rank_df.rename(columns={'ipc3_tci': 'ipc3_tci_name', 'rank': 'rank_name'}),
    on='ipc3',
    how='inner',
)

In [None]:
# Axes for a rank chart; the y range is fixed to 0.5 .. 35.5.
# (A stray leading comma in this call was a SyntaxError and has been dropped.)
fig, ax = plt.subplots(figsize=(12, 8),
                       subplot_kw=dict(ylim=(0.5, 0.5 + 35)))

# IPC classes ordered by their corporate-level rank.
first_top_sources = addr_name_ipc3_rank_df.sort_values('rank_name', ascending=True)
ax.xaxis.set_major_locator(MultipleLocator(1))  # one major x tick per unit
# Place the y ticks exactly at the corporate-level rank values.
ax.yaxis.set_major_locator(FixedLocator(first_top_sources['rank_name'].to_list()))




In [None]:
# Regional rows of the "Textile and paper machines" field. The field name is
# spelled as in the rank_doubleaxis cell; the misspelling "Texttile" would
# match nothing.
addr_df.query('schmoch35 == "Textile and paper machines"')

In [None]:
# Show the corporate-level table as the cell output.
name_df

In [None]:
# Compare the IPC-class TCI ranking of one Schmoch field between the regional
# and the corporate table.
field_query = 'schmoch35 == "Textile and paper machines"'
regional_rows = addr_df.query(field_query)
corporate_rows = name_df.query(field_query)

plot_props = {
    "figsize": (6, 8),
    "xlabel": "",
    "ylabel": "",
    "title": "",
    "fontsize": 20,
    "year_range": 15,
    "ascending": False,
    "color": "default",
}

vr.rank_doubleaxis(
    df_dict={"Regional": regional_rows, "Corporate": corporate_rows},
    rank_num=len(regional_rows),  # rank every IPC class found in the regional subset
    member_col="ipc3",
    value_col="ipc3_tci",
    prop_dict=plot_props,
)
plt.ylabel("IPC Class TCI Ranking", fontsize=24, fontweight="bold")