In [1]:
import os
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [46]:
class Analysis:
    def __init__(self, data):
        self.data = data

    def analyze_sales(self):
      # 计算总销售额
      sales_summary = self.data['Sales'].sum()

      # 计算均值
      sales_mean = round(self.data['Sales'].mean(), 2)

      # 计算标准差
      sales_std = round(self.data['Sales'].std(), 2)

      # 计算最小值
      sales_min = self.data['Sales'].min()

      # 计算最大值
      sales_max = self.data['Sales'].max()

      # 计算25%、50%、75%的分位数（即分位数）
      sales_percentiles = self.data['Sales'].quantile([0.25, 0.5, 0.75]).round(2)

      # 返回字典形式的所有统计结果
      return {
          'sales_summary': sales_summary,
          'sales_mean': sales_mean,
          'sales_std': sales_std,
          'sales_min': sales_min,
          'sales_max': sales_max,
          'sales_25th_percentile': sales_percentiles[0.25],
          'sales_50th_percentile': sales_percentiles[0.5],  # 中位数
          'sales_75th_percentile': sales_percentiles[0.75]
      }

    def analyze_quantity(self):
        # 直接计算总数量，不分组
        quantity_summary = self.data['Quantity'].sum()
        # 计算均值
        quantity_mean = round(self.data['Quantity'].mean(), 2)

        # 计算标准差
        quantity_std = round(self.data['Quantity'].std(), 2)

        # 计算最小值
        quantity_min = self.data['Quantity'].min()

        # 计算最大值
        quantity_max = self.data['Quantity'].max()

        # 计算25%、50%、75%的分位数（即分位数）
        quantity_percentiles = self.data['Quantity'].quantile([0.25, 0.5, 0.75]).round(2)

        # 返回字典形式的所有统计结果
        return {
            'quantity_summary': quantity_summary,
            'quantity_mean': quantity_mean,
            'quantity_std': quantity_std,
            'quantity_min': quantity_min,
            'quantity_max': quantity_max,
            'quantity_25th_percentile': quantity_percentiles[0.25],
            'quantity_50th_percentile': quantity_percentiles[0.5],  # 中位数
            'quantity_75th_percentile': quantity_percentiles[0.75]
        }

    def analyze_price(self):
        # 直接计算平均价格，不分组
        avg_price = round(self.data['Price'].mean(), 2)
        # 计算标准差
        price_std = round(self.data['Price'].std(), 2)

        # 计算最小值
        price_min = self.data['Price'].min()

        # 计算最大值
        price_max = self.data['Price'].max()

        # 计算25%、50%、75%的分位数（即分位数）
        price_percentiles = self.data['Price'].quantile([0.25, 0.5, 0.75]).round(2)

        # 返回字典形式的所有统计结果
        return {
            'avg_price': avg_price,
            'price_std': price_std,
            'price_min': price_min,
            'price_max': price_max,
            'price_25th_percentile': price_percentiles[0.25],
            'price_50th_percentile': price_percentiles[0.5],  # 中位数
            'price_75th_percentile': price_percentiles[0.75]
        }

    def analyze_year_distribution(self):
        # 仅按 Year 分组
        year_dist = self.data.groupby('Year').size()
        year_dist_percentage = (year_dist / year_dist.sum() * 100).round(2)

        # 转换为字典格式
        result = {
            str(year): {
                'count': int(count),
                'percentage': float(percentage)
            }
            for year, count, percentage in zip(
                year_dist.index,
                year_dist,
                year_dist_percentage
            )
        }
        return {'year_distribution': result}

    def analyze_month_distribution(self):
        """
        分析月份分布
        :return: 包含计数和百分比的字典
        """
        month_dist = self.data['Month'].value_counts().sort_index()  # Ensure sorting by month
        month_percentage = (month_dist / month_dist.sum() * 100).round(2)

        month_dict = {
            month: {
                'count': int(count),
                'percentage': float(percentage)
            }
            for month, count, percentage in zip(
                month_dist.index,
                month_dist,
                month_percentage
            )
        }

        return {'month_distribution': month_dict}

    def analyze_distributor_distribution(self):
        # 计算分销商分布
        distributor_dist = self.data['Distributor'].value_counts()
        distributor_percentage = (distributor_dist / distributor_dist.sum() * 100).round(2)

        distributor_dict = {
            distributor: {
                'count': int(count),
                'percentage': float(percentage)
            }
            for distributor, count, percentage in zip(
                distributor_dist.index,
                distributor_dist,
                distributor_percentage
            )
        }
        return {'distributor_distribution': distributor_dict}

    def analyze_customer_distribution(self):
        # 计算客户分布
        # 不适用
        customer_dist = self.data['Customer Name'].value_counts()
        customer_percentage = (customer_dist / customer_dist.sum() * 100).round(2)

        customer_dict = {
            customer: {
                'count': int(count),
                'percentage': float(percentage)
            }
            for customer, count, percentage in zip(
                customer_dist.index,
                customer_dist,
                customer_percentage
            )
        }
        return {'customer_distribution': customer_dict}

    def analyze_city_distribution(self):
        # 不适用
        city_dist = self.data['City'].value_counts()
        city_percentage = (city_dist / city_dist.sum() * 100).round(2)

        city_dict = {
            city: {
                'count': int(count),
                'percentage': float(percentage)
            }
            for city, count, percentage in zip(
                city_dist.index,
                city_dist,
                city_percentage
            )
        }
        return {'city_distribution': city_dict}

    def analyze_channel_distribution(self):
        channel_dist = self.data['Channel'].value_counts()
        channel_percentage = (channel_dist / channel_dist.sum() * 100).round(2)

        channel_dict = {
            channel: {
                'count': int(count),
                'percentage': float(percentage)
            }
            for channel, count, percentage in zip(
                channel_dist.index,
                channel_dist,
                channel_percentage
            )
        }
        return {'channel_distribution': channel_dict}

    def analyze_channel_subchannel_distribution(self):
        # 联合分布分析：Channel × Sub-channel
        cross_dist = self.data.groupby(['Channel', 'Sub-channel']).size().unstack(fill_value=0)

        # 计算各Channel内Sub-channel的百分比分布
        channel_subchannel_percentage = cross_dist.div(cross_dist.sum(axis=1), axis=0) * 100

        # 转换为嵌套字典格式
        result = {}
        for channel in cross_dist.index:
            result[channel] = {}
            for subchannel in cross_dist.columns:
                if int(cross_dist.loc[channel, subchannel]) != 0:
                  result[channel][subchannel] = {
                      'count': int(cross_dist.loc[channel, subchannel]),
                      'percentage': round(channel_subchannel_percentage.loc[channel, subchannel], 2)
                  }

        return {'channel_subchannel_distribution': result}

    def analyze_product_distribution(self, top_n=20):
        """分析产品名称分布，默认显示前20个产品"""
        #不适用
        product_dist = self.data['Product Name'].value_counts()
        product_percentage = (product_dist / product_dist.sum() * 100).round(2)

        # 获取前top_n个产品
        top_products = product_dist.head(top_n)
        other_count = product_dist.sum() - top_products.sum()
        other_percentage = (other_count / product_dist.sum() * 100).round(2)

        product_dict = {
            product: {
                'count': int(count),
                'percentage': float(percentage)
            }
            for product, count, percentage in zip(
                top_products.index,
                top_products,
                product_percentage.head(top_n)
            )
        }

        # 添加"其他产品"类别
        if other_count > 0:
            product_dict["Other Products"] = {
                'count': int(other_count),
                'percentage': float(other_percentage)
            }

        return {'product_distribution': product_dict}

    def analyze_sales_rep_distribution(self, top_n=15):
        """
        分析销售代表分布
        :param top_n: 显示前多少位销售代表，其余归类为"Other Reps"
        :return: 包含计数和百分比的字典
        """
        rep_dist = self.data['Name of Sales Rep'].value_counts()
        rep_percentage = (rep_dist / rep_dist.sum() * 100).round(2)

        # 获取前top_n位销售代表
        top_reps = rep_dist.head(top_n)
        other_count = rep_dist.sum() - top_reps.sum()
        other_percentage = (other_count / rep_dist.sum() * 100).round(2)

        rep_dict = {
            rep: {
                'count': int(count),
                'percentage': float(percentage),
                'performance_rank': rank + 1  # 添加绩效排名(按订单量)
            }
            for rank, (rep, count, percentage) in enumerate(zip(
                top_reps.index,
                top_reps,
                rep_percentage.head(top_n)
            ))
        }

        # 添加"其他代表"类别
        if other_count > 0:
            rep_dict["Other Reps"] = {
                'count': int(other_count),
                'percentage': float(other_percentage),
                'performance_rank': None
            }

        return {'sales_rep_distribution': rep_dict}

    def analyze_manager_distribution(self):
        """
        分析经理分布
        :return: 包含计数和百分比的字典
        """
        manager_dist = self.data['Manager'].value_counts()
        manager_percentage = (manager_dist / manager_dist.sum() * 100).round(2)

        manager_dict = {
            manager: {
                'count': int(count),
                'percentage': float(percentage)
            }
            for manager, count, percentage in zip(
                manager_dist.index,
                manager_dist,
                manager_percentage
            )
        }

        return {'manager_distribution': manager_dict}

    def analyze_sales_team_distribution(self):
        """
        分析销售团队分布
        :return: 包含计数和百分比的字典
        """
        sales_team_dist = self.data['Sales Team'].value_counts()
        sales_team_percentage = (sales_team_dist / sales_team_dist.sum() * 100).round(2)

        sales_team_dict = {
            team: {
                'count': int(count),
                'percentage': float(percentage)
            }
            for team, count, percentage in zip(
                sales_team_dist.index,
                sales_team_dist,
                sales_team_percentage
            )
        }

        return {'sales_team_distribution': sales_team_dict}

    def analyze_all(self):
        return {
            **self.analyze_year_distribution(),
            **self.analyze_month_distribution(),
            **self.analyze_sales(),
            **self.analyze_quantity(),
            **self.analyze_price(),
            **self.analyze_distributor_distribution(),
            **self.analyze_channel_distribution(),
            **self.analyze_channel_subchannel_distribution(),
            **self.analyze_sales_rep_distribution(),
            **self.analyze_manager_distribution(),
            **self.analyze_sales_team_distribution(),
            #**self.analyze_product_distribution(),
            #**self.analyze_city_distribution(),
            #**self.analyze_customer_distribution()
        }

In [3]:
data = pd.read_csv('./data/Pharm Data_Data.csv')
data['Product Class-Country'] = data['Product Class'] + '-' + data['Country']

In [5]:
data.head()

Unnamed: 0,Distributor,Customer Name,City,Country,Latitude,Longitude,Channel,Sub-channel,Product Name,Product Class,Quantity,Price,Sales,Month,Year,Name of Sales Rep,Manager,Sales Team,Product Class-Country
0,Gottlieb-Cruickshank,"Zieme, Doyle and Kunze",Lublin,Poland,51.2333,22.5667,Hospital,Private,Topipizole,Mood Stabilizers,4.0,368,1472.0,January,2018,Mary Gerrard,Britanny Bold,Delta,Mood Stabilizers-Poland
1,Gottlieb-Cruickshank,Feest PLC,Świecie,Poland,53.4167,18.4333,Pharmacy,Retail,Choriotrisin,Antibiotics,7.0,591,4137.0,January,2018,Jessica Smith,Britanny Bold,Delta,Antibiotics-Poland
2,Gottlieb-Cruickshank,Medhurst-Beer Pharmaceutical Limited,Rybnik,Poland,50.0833,18.5,Pharmacy,Institution,Acantaine,Antibiotics,30.0,66,1980.0,January,2018,Steve Pepple,Tracy Banks,Bravo,Antibiotics-Poland
3,Gottlieb-Cruickshank,Barton Ltd Pharma Plc,Czeladź,Poland,50.3333,19.0833,Hospital,Private,Lioletine Refliruvax,Analgesics,6.0,435,2610.0,January,2018,Mary Gerrard,Britanny Bold,Delta,Analgesics-Poland
4,Gottlieb-Cruickshank,Keeling LLC Pharmacy,Olsztyn,Poland,53.78,20.4942,Pharmacy,Retail,Oxymotroban Fexoformin,Analgesics,20.0,458,9160.0,January,2018,Anne Wu,Britanny Bold,Delta,Analgesics-Poland


In [47]:
for category in data['Product Class-Country'].unique():
    print(f'Processing: {category}')
    if not os.path.exists(f'./analysis_result_{category}'):
        os.makedirs(f'./analysis_result_{category}')
    category_data = data[data['Product Class-Country'] == category]
    analysis = Analysis(category_data)
    analysis_result = analysis.analyze_all()
    #print(analysis_result)
    with open(f'./analysis_result_{category}/analysis_result_{category}.json', 'w') as f:
        json.dump(analysis_result, f, indent=4)

Processing: Mood Stabilizers-Poland
{'year_distribution': {'2018': {'count': 7307, 'percentage': 100.0}}, 'month_distribution': {'April': {'count': 460, 'percentage': 6.3}, 'August': {'count': 675, 'percentage': 9.24}, 'December': {'count': 582, 'percentage': 7.96}, 'February': {'count': 616, 'percentage': 8.43}, 'January': {'count': 641, 'percentage': 8.77}, 'July': {'count': 606, 'percentage': 8.29}, 'June': {'count': 504, 'percentage': 6.9}, 'March': {'count': 749, 'percentage': 10.25}, 'May': {'count': 521, 'percentage': 7.13}, 'November': {'count': 664, 'percentage': 9.09}, 'October': {'count': 725, 'percentage': 9.92}, 'September': {'count': 564, 'percentage': 7.72}}, 'sales_summary': np.float64(121097293.0), 'sales_mean': np.float64(16572.78), 'sales_std': 75620.76, 'sales_min': -14915.0, 'sales_max': 2204000.0, 'sales_25th_percentile': np.float64(962.0), 'sales_50th_percentile': np.float64(2600.0), 'sales_75th_percentile': np.float64(8350.0), 'quantity_summary': np.float64(2993