In [1]:
import os
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import DBSCAN, KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, silhouette_score
from sklearn.model_selection import train_test_split

import sys
# Show the path of the Python interpreter running this kernel (environment sanity check).
print(sys.executable)

F:\anaconda3\envs\arin7102\python.exe


In [13]:
class Analysis:
    def __init__(self, data):
        """
        Keep a reference to the table that every ``analyze_*`` method reads.

        :param data: pandas DataFrame of sales records; each method accesses
                     the columns it needs by name (e.g. ``'Sales'``)
        """
        self.data = data

    def analyze_sales(self):
        '''
        销售额分析
        '''
        # 计算总销售额
        sales_summation = self.data['Sales'].sum()
        # 计算均值
        sales_mean = round(self.data['Sales'].mean(), 3)
        # 计算标准差
        sales_std = round(self.data['Sales'].std(), 3)
        # 计算最小值
        sales_min = self.data['Sales'].min()
        # 计算最大值
        sales_max = self.data['Sales'].max()
        # 计算25%、50%、75%的分位数（即分位数）
        sales_percentiles = self.data['Sales'].quantile([0.25, 0.5, 0.75]).round(3)
        
        result = {
            'sales_summation': sales_summation,
            'sales_mean': sales_mean,
            'sales_std': sales_std,
            'sales_min': sales_min,
            'sales_max': sales_max,
            'sales_percentiles': {
                '25%': sales_percentiles[0.25],
                '50%': sales_percentiles[0.5],  # 中位数
                '75%': sales_percentiles[0.75]
            }
        }
        # 返回字典形式的所有统计结果
        return {'sales_summary': result}

    def analyze_quantity(self):
        '''
        数量分析
        '''
        # 直接计算总数量，不分组
        quantity_summation = self.data['Quantity'].sum()
        # 计算均值
        quantity_mean = round(self.data['Quantity'].mean(), 3)
        # 计算标准差
        quantity_std = round(self.data['Quantity'].std(), 3)
        # 计算最小值
        quantity_min = self.data['Quantity'].min()
        # 计算最大值
        quantity_max = self.data['Quantity'].max()
        # 计算25%、50%、75%的分位数（即分位数）
        quantity_percentiles = self.data['Quantity'].quantile([0.25, 0.5, 0.75]).round(3)
        
        
        result = {
            'quantity_summation': quantity_summation,
            'quantity_mean': quantity_mean,
            'quantity_std': quantity_std,
            'quantity_min': quantity_min,
            'quantity_max': quantity_max,
            'quantity_percentiles': {
                '25%': quantity_percentiles[0.25],
                '50%': quantity_percentiles[0.5],  # 中位数
                '75%': quantity_percentiles[0.75]
            }
        }
        
        # 返回字典形式的所有统计结果
        return {'quantity_summary': result}

    def analyze_price(self):
        '''
        价格分析
        '''
        # 直接计算平均价格，不分组
        price_avg = round(self.data['Price'].mean(), 3)
        # 计算标准差
        price_std = round(self.data['Price'].std(), 3)
        # 计算最小值
        price_min = self.data['Price'].min()
        # 计算最大值
        price_max = self.data['Price'].max()
        # 计算25%、50%、75%的分位数（即分位数）
        price_percentiles = self.data['Price'].quantile([0.25, 0.5, 0.75]).round(3)

        result = {
            'price_avg': price_avg,
            'price_std': price_std,
            'price_min': price_min,
            'price_max': price_max,
            'price_percentiles': {
                '25%': price_percentiles[0.25],
                '50%': price_percentiles[0.5],  # 中位数
                '75%': price_percentiles[0.75]
            }
        }
        
        return {'price_summary': result}


    def analyze_year_distribution(self):
        '''
        年份分布分析
        '''
        # 仅按 Year 分组
        year_dist = self.data.groupby('Year').size()
        year_dist_percentage = (year_dist / year_dist.sum() * 100).round(3)

        # 转换为字典格式
        result = {
            str(year): {
                'count': int(count),
                'percentage': float(percentage)
            }
            for year, count, percentage in zip(
                year_dist.index,
                year_dist,
                year_dist_percentage
            )
        }
        return {'year_distribution': result}

    
    def analyze_month_distribution(self):
        '''
        月份分布分析
        '''
        month_dist = self.data['Month'].value_counts().sort_index()  # Ensure sorting by month
        month_percentage = (month_dist / month_dist.sum() * 100).round(3)

        month_dict = {
            month: {
                'count': int(count),
                'percentage': float(percentage)
            }
            for month, count, percentage in zip(
                month_dist.index,
                month_dist,
                month_percentage
            )
        }

        return {'month_distribution': month_dict}


    def analyze_distributor_performance(self, top_n=10):

        if 'Distributor' not in self.data.columns or 'Sales' not in self.data.columns:
            return {'distributor_performance': {'error': "Required columns ('Distributor', 'Sales') not found."}}

        distributor_sales = self.data.groupby('Distributor')['Sales'].sum().nlargest(top_n)

        if distributor_sales.empty:
            return {'distributor_performance': {}}

        total_sales = self.data['Sales'].sum()
        result = {}
        if total_sales > 0:
            sales_percentage = (distributor_sales / total_sales * 100).round(3)
            result = {
                distributor: {
                    'total_sales': round(float(sales), 2),
                    'percentage_of_total_sales': float(percentage)
                }
                for distributor, sales, percentage in zip(
                    distributor_sales.index,
                    distributor_sales,
                    sales_percentage
                )
            }
        else: # Handle case where total_sales is zero
             result = {
                distributor: {
                    'total_sales': round(float(sales), 2),
                    'percentage_of_total_sales': 0.0
                }
                for distributor, sales in distributor_sales.items()
            }

        return {f'top_{top_n}_distributor_performance': result}


    def analyze_top_sales_reps(self, top_n=15):
        """
        分析销售代表分布

        :param top_n: 显示前多少位销售代表，其余归类为"Other Reps"

        :return: 包含计数和百分比的字典
        """
        rep_dist = self.data['Name of Sales Rep'].value_counts()
        rep_percentage = (rep_dist / rep_dist.sum() * 100).round(3)
        
        rep_team_mapping = (
            self.data[['Name of Sales Rep', 'Sales Team']]
            .drop_duplicates(subset='Name of Sales Rep', keep='first')
            .set_index('Name of Sales Rep')['Sales Team']
            .to_dict()
        )

        # 获取前top_n位销售代表
        top_reps = rep_dist.head(top_n)
        other_count = rep_dist.sum() - top_reps.sum()
        other_percentage = (other_count / rep_dist.sum() * 100).round(3)

        rep_dict = {
            rep: {
                'Sales Team': rep_team_mapping.get(rep, None),
                'count': int(count),
                'percentage': float(percentage),
                'performance_rank': rank + 1  # 添加绩效排名(按订单量)
            }
            for rank, (rep, count, percentage) in enumerate(zip(
                top_reps.index,
                top_reps,
                rep_percentage.head(top_n)
            ))
        }

        # 添加"其他代表"类别
        if other_count > 0:
            rep_dict["Other Reps"] = {
                'count': int(other_count),
                'percentage': float(other_percentage),
                'performance_rank': None
            }

        return {'sales_rep_distribution': rep_dict}

    
    
    def analyze_channels(self):
        """
        分析渠道表现（包含全局占比）
        """
        # 计算全局总量
        global_sales = self.data['Sales'].sum()
        global_orders = self.data['Customer Name'].count()
        
        # 计算渠道数据
        channel_total = self.data.groupby('Channel').agg(
            channel_sales=('Sales', 'sum'),
            channel_orders=('Customer Name', 'count')
        ).reset_index()
        
        # 计算子渠道数据
        subchannel_detail = self.data.groupby(['Channel', 'Sub-channel']).agg(
            sub_sales=('Sales', 'sum'),
            sub_orders=('Customer Name', 'count')
        ).reset_index()
        
        # 合并计算各级占比
        merged = pd.merge(subchannel_detail, channel_total, on='Channel')
        # 主渠道全局占比
        merged['channel_sales_pct'] = (merged['channel_sales'] / global_sales * 100).round(3)
        merged['channel_orders_pct'] = (merged['channel_orders'] / global_orders * 100).round(3)
        # 子渠道局部占比
        merged['sub_sales_pct'] = (merged['sub_sales'] / merged['channel_sales'] * 100).round(3)
        merged['sub_orders_pct'] = (merged['sub_orders'] / merged['channel_orders'] * 100).round(3)
        
        # 构建嵌套结构
        result = {}
        for _, row in merged.iterrows():
            channel = str(row['Channel'])
            sub = str(row['Sub-channel'])
            
            if channel not in result:
                result[channel] = {
                    'global_sales_share': float(row['channel_sales_pct']),  # 主渠道全局销售占比
                    'global_order_share': float(row['channel_orders_pct']),  # 主渠道全局订单占比
                    'total_sales': float(round(row['channel_sales'], 2)),
                    'total_orders': int(row['channel_orders']),
                    'sub_channels': {}
                }
                
            result[channel]['sub_channels'][sub] = {
                'sub_sales': float(round(row['sub_sales'], 2)),
                'sub_orders': int(row['sub_orders']),
                'channel_sales_share': float(row['sub_sales_pct']),  # 子渠道在所属主渠道的销售占比
                'channel_order_share': float(row['sub_orders_pct']),  # 子渠道在所属主渠道的订单占比
                'global_sales_share': float((row['sub_sales'] / global_sales * 100).round(3)),  # 子渠道全局销售占比
                'global_order_share': float((row['sub_orders'] / global_orders * 100).round(3))  # 子渠道全局订单占比
            }
        
        return {'channel_performance': result}

    
    def analyze_sales_trend(self):
        """
        分析销售额每月变化趋势

        :return: timeline：时间线；sales：销售额；order_counts：订单数量
        """
        trend = self.data.groupby('YearMonth')['Sales'].agg(['sum', 'count']).reset_index()
        trend['month_str'] = trend['YearMonth'].dt.strftime('%Y-%m')
        
        return {
            'sales_trend': {
                'timeline': trend['month_str'].tolist(),
                'sales': trend['sum'].round(2).tolist(),
                'order_counts': trend['count'].tolist()
            }
        }


    def analyze_pricing(self):
        """价格敏感度分析"""
        price_bins = pd.cut(self.data['Price'], bins=5)
        price_analysis = self.data.groupby(price_bins, observed=True).agg(
            total_quantity=('Quantity', 'sum'),
            total_sales=('Sales', 'sum'),
            product_count=('Product Name', pd.Series.nunique)
        )
        
        return {
            'price_sensitivity': {
                str(interval): {
                    'total_quantity': int(row['total_quantity']),
                    'total_sales': float(round(row['total_sales'], 2)),
                    'product_count': int(row['product_count'])
                }
                for interval, row in price_analysis.iterrows()
            }
        }


    def analyze_teams(self):
        """
        分析销售团队绩效
        """
        # 计算全局总量
        global_sales = self.data['Sales'].sum()
        global_orders = self.data['Customer Name'].count()
        
        # 团队维度聚合
        team_stats = self.data.groupby(['Sales Team', 'Manager']).agg(
            total_sales=('Sales', 'sum'),
            total_orders=('Customer Name', 'count'),
            unique_customers=('Customer Name', pd.Series.nunique),
            unique_reps=('Name of Sales Rep', pd.Series.nunique)
        ).reset_index()
        
        # 计算各项占比
        team_stats['sales_percentage'] = (team_stats['total_sales'] / global_sales * 100).round(3)
        team_stats['order_percentage'] = (team_stats['total_orders'] / global_orders * 100).round(3)
        
        result = {}
        for _, row in team_stats.iterrows():
            team_name = str(row['Sales Team'])
            result[team_name] = {
                'manager': str(row['Manager']),
                'total_sales': float(round(row['total_sales'], 2)),
                'sales_percentage': float(row['sales_percentage']),  # 销售额全局占比
                'total_orders': int(row['total_orders']),
                'order_percentage': float(row['order_percentage']),  # 订单量全局占比
                'avg_order_value': float(round(row['total_sales'] / row['total_orders'], 2)),
                'unique_customers': int(row['unique_customers']),
                'unique_reps': int(row['unique_reps'])  # 销售代表数量
            }
        
        return {'team_performance': result}


    def analyze_average_transaction_value(self):
        """
        计算并分析平均交易额（单笔销售记录的平均 Sales 值）
        也可以按不同维度（如渠道、产品类别）分析

        :return: 包含总体平均交易额和按渠道的平均交易额的字典
        """
        overall_avg_sales = self.data['Sales'].mean()
        channel_avg_sales = self.data.groupby('Channel')['Sales'].mean().sort_values(ascending=False)

        channel_avg_dict = {
            channel: round(float(avg_val), 2)
            for channel, avg_val in channel_avg_sales.items()
        }

        return {
            'average_transaction_value': {
                'overall': round(float(overall_avg_sales), 2),
                'by_channel': channel_avg_dict
            }
        }


    def analyze_quantity_vs_price(self, top_n=5):
        """
        分析销量最高和平均价格最高/最低的产品 (提供一些关于产品定位的洞察)

        :param top_n: 返回排名前 N 的产品数量
        :return: 包含高销量、高价、低价产品信息的字典
        """
        # 按总销量排名
        top_quantity_products = self.data.groupby('Product Name')['Quantity'].sum().nlargest(top_n)
        # 计算每个产品的平均售价
        avg_price_products = self.data.groupby('Product Name')['Price'].mean()

        # 平均售价最高的产品
        highest_avg_price = avg_price_products.nlargest(top_n)
        # 平均售价最低的产品
        lowest_avg_price = avg_price_products.nsmallest(top_n)

        result = {
            f'top_{top_n}_products_by_quantity': {
                prod: int(qty) for prod, qty in top_quantity_products.items()
            },
            f'top_{top_n}_products_by_highest_avg_price': {
                prod: round(float(price), 2) for prod, price in highest_avg_price.items()
            },
            f'top_{top_n}_products_by_lowest_avg_price': {
                prod: round(float(price), 2) for prod, price in lowest_avg_price.items()
            }
        }
        return {'quantity_vs_price_insights': result}


    def analyze_top_customers(self, top_n=10):
        """
        分析销售额最高的客户

        :param top_n: 返回排名前 N 的客户数量
        :return: 包含客户名和总销售额的字典
        """
        top_customers = self.data.groupby('Customer Name')['Sales'].sum().nlargest(top_n)
        result = {
            customer: round(float(sales), 2)
            for customer, sales in top_customers.items()
        }
        return {f'top_{top_n}_customers_by_sales': result}
    

    def analyze_customer_rfm(self):
        """
        客户RFM分析（Recency, Frequency, Monetary）
        """
        latest_month = self.data['YearMonth'].max()
        rfm_data = self.data.groupby('Customer Name', observed=True).agg({
            'YearMonth': lambda x: (latest_month - x.max()).n,  # Recency: 距离最近一次购买的月份间隔
            'Quantity': 'count',           # Frequency: 购买次数
            'Sales': 'sum'                 # Monetary: 总销售额
        }).reset_index()
        rfm_data.columns = ['Customer', 'Recency', 'Frequency', 'Monetary']

        quantiles = rfm_data[['Recency', 'Frequency', 'Monetary']].quantile(q=[0.2, 0.4, 0.6, 0.8]).to_dict()

        def r_score(x):
            if x <= quantiles['Recency'][0.2]: return 5
            elif x <= quantiles['Recency'][0.4]: return 4
            elif x <= quantiles['Recency'][0.6]: return 3
            elif x <= quantiles['Recency'][0.8]: return 2
            else: return 1

        def fm_score(x, col):
            if x <= quantiles[col][0.2]: return 1
            elif x <= quantiles[col][0.4]: return 2
            elif x <= quantiles[col][0.6]: return 3
            elif x <= quantiles[col][0.8]: return 4
            else: return 5

        rfm_data['R_Score'] = rfm_data['Recency'].apply(r_score)
        rfm_data['F_Score'] = rfm_data['Frequency'].apply(lambda x: fm_score(x, 'Frequency'))
        rfm_data['M_Score'] = rfm_data['Monetary'].apply(lambda x: fm_score(x, 'Monetary'))
        rfm_data['RFM_Score'] = rfm_data['R_Score'] + rfm_data['F_Score'] + rfm_data['M_Score']

        rfm_data['Segment'] = pd.qcut(rfm_data['RFM_Score'], q=3, labels=['low', 'medium', 'high'])

        segment_counts = rfm_data['Segment'].value_counts(normalize=True)

        segment_analysis = rfm_data.groupby('Segment', observed=True).agg({
            'Recency': 'mean',
            'Frequency': 'mean',
            'Monetary': 'mean',
            'Customer': 'count'
        }).rename(columns={'Customer': 'Customer Count'})

        result = {
            'value_ratio': segment_counts.to_dict(),
            'value_analysis': {
                str(segment): {
                    'Recency': round(float(row['Recency']), 2),
                    'Frequency': round(float(row['Frequency']), 2),
                    'Monetary': round(float(row['Monetary']), 2),
                    'Customer Count': int(row['Customer Count'])
                } for segment, row in segment_analysis.iterrows()
            }
        }
        return {'customer_rfm_analysis': result}
    

    def analyze_top_products(self, top_n=10):
        """
        分析销售额最高的产品

        :param top_n: 返回排名前 N 的产品数量
        :return: 包含产品名和总销售额的字典
        """
        top_products = self.data.groupby('Product Name', observed=True)['Sales'].sum().nlargest(top_n)
        total_sales = self.data['Sales'].sum()
        
        result = {
            product: {
                'total_sales': round(float(sales), 2),
                'percentage': float((sales / total_sales * 100).round(3))
            }
            for product, sales in top_products.items()
        }
        return {f'top_{top_n}_products_by_sales': result}
    

    def analyze_geo_preference(self, top_n=10):
        '''
        分析药品的地区偏好

        :param top_n: 排名前n的城市
        '''
        country = self.data['Country'].unique()[0]
        product_class = self.data['Product Class'].unique()[0]

        result = self.data.groupby('City')['Sales'].sum().sort_values(ascending=False).head(top_n)
        result = {
            city: {
                'total_sales': round(float(sales), 2),
                'percentage': float((sales / self.data['Sales'].sum() * 100).round(3))
            }
            for city, sales in result.items()
        }

        return {f"top_{top_n}_cities_in_{country}_by_sales_of_{product_class}": result}
    

    def analyze_sales_regression(self):
        """
        Linear regression of ``Sales`` on quantity, price and categorical context.

        ``Channel``, ``Sub-channel``, ``Month`` and ``Sales Team`` are one-hot
        encoded. The model is fitted on 80% of the rows and scored on the
        remaining 20%; the split uses a fixed ``random_state``.

        :return: ``{'sales_regression_analysis': {...}}`` with the test-split
                 R2 and mean squared error, the coefficients grouped by source
                 column (largest coefficient first within each group) and the
                 ten coefficients with the largest absolute value
        """
        feature_cols = ['Quantity', 'Price', 'Channel', 'Sub-channel', 'Month', 'Sales Team']
        categorical_cols = ['Channel', 'Sub-channel', 'Month', 'Sales Team']

        features = self.data[feature_cols]
        target = self.data['Sales']
        features_encoded = pd.get_dummies(features, columns=categorical_cols)

        X_train, X_test, y_train, y_test = train_test_split(
            features_encoded,
            target,
            test_size=0.2,
            random_state=42
        )

        model = LinearRegression()
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        r2 = r2_score(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)

        coefficients = pd.DataFrame({
            'Feature': features_encoded.columns,
            'Coefficient': model.coef_
        }).sort_values('Coefficient', ascending=False)
        ranked = list(zip(coefficients['Feature'].tolist(), coefficients['Coefficient'].tolist()))

        def comes_from(encoded_name, source_col):
            # Numeric columns keep their name; get_dummies names an encoded
            # column '<source>_<value>'. Matching on that exact shape avoids
            # the false positives of a substring search (a category value
            # that merely contains another column's name).
            return encoded_name == source_col or encoded_name.startswith(f'{source_col}_')

        result = {
            'r2_score': round(float(r2), 3),
            'mean_squared_error': round(float(mse), 3),
            'coefficients': {
                source_col: {
                    feature: round(float(coef), 3)
                    for feature, coef in ranked
                    if comes_from(feature, source_col)
                }
                for source_col in feature_cols
            }
        }

        # The ten features whose coefficients have the largest magnitude.
        by_magnitude = coefficients.copy()
        by_magnitude['Abs_Coefficient'] = by_magnitude['Coefficient'].abs()
        top_influential = by_magnitude.sort_values('Abs_Coefficient', ascending=False).head(10)

        result['top_influential_features'] = {
            feature: round(float(coef), 3)
            for feature, coef in zip(
                top_influential['Feature'].tolist(),
                top_influential['Coefficient'].tolist()
            )
        }

        return {'sales_regression_analysis': result}
    

    def analyze_clustering(self, n_clusters=2):
        """
        K-means clustering of the records on standardised ``Quantity`` and ``Price``.

        NOTE(review): ``n_init`` is left at the library default, which differs
        between scikit-learn releases - consider pinning it if results must be
        repeatable across environments.

        :param n_clusters: number of clusters to fit (defaults to 2)
        :return: ``{'clustering_analysis': {...}}`` with the cluster centres in
                 original units (2 decimals), the number of records per cluster
                 and the silhouette score (3 decimals)
        """
        features = self.data[['Quantity', 'Price']]

        scaler = StandardScaler()
        scaled_features = scaler.fit_transform(features)

        kmeans = KMeans(n_clusters=n_clusters, random_state=42)
        cluster_labels = kmeans.fit_predict(scaled_features)
        score = silhouette_score(scaled_features, cluster_labels)

        # Map the centres back from standardised to original units.
        centers = pd.DataFrame(
            scaler.inverse_transform(kmeans.cluster_centers_),
            columns=features.columns
        )
        # Counting the label array directly avoids copying the whole frame.
        cluster_counts = pd.Series(cluster_labels).value_counts().sort_index()

        result = {
            'cluster_centers': {
                f'Cluster {i}': {
                    'Quantity': round(float(center['Quantity']), 2),
                    'Price': round(float(center['Price']), 2)
                } for i, center in centers.iterrows()
            },
            'cluster_counts': {
                f'Cluster {i}': int(count) for i, count in cluster_counts.items()
            },
            'silhouette_score': round(float(score), 3)
        }

        return {'clustering_analysis': result}

    def analyze_all(self):
        return {
            # 简单时间分布（行数过多，需要时再启用）
            # **self.analyze_year_distribution(),
            # **self.analyze_month_distribution(),

            # 销售数据总览
            **self.analyze_sales(),
            **self.analyze_quantity(),
            **self.analyze_price(),

            # 经销商总体分析
            **self.analyze_distributor_performance(),
            # 销售团队/销售员分析
            **self.analyze_teams(),
            **self.analyze_top_sales_reps(),

            # 每月销售情况趋势（行数过多，需要时再启用）
            # **self.analyze_sales_trend(),

            # 销售渠道分析
            **self.analyze_channels(),
            # 定价分析
            **self.analyze_pricing(),

            # 药品的地区偏好
            **self.analyze_geo_preference(),

            # 单笔平均销售额
            **self.analyze_average_transaction_value(),

            # 销售额最高的客户
            **self.analyze_top_customers(),
            # 客户RFM分析
            **self.analyze_customer_rfm(),

            # 分析销量最高和平均价格最高/最低的产品
            **self.analyze_quantity_vs_price(),
            # 销售额最高的产品
            **self.analyze_top_products(),

            # 销售额回归分析
            **self.analyze_sales_regression(),

            # 销售数据聚类分析
            **self.analyze_clustering()
            
        }

In [3]:
data = pd.read_csv('../data/Pharm Data_Data.csv')

# Composite key used further down to slice the data per product class and country.
data['Product Class-Country'] = data['Product Class'] + '-' + data['Country']

# Month name (full or abbreviated) -> two-digit month number.
month_mapping = {
    'January': '01', 'Jan': '01',
    'February': '02', 'Feb': '02', 
    'March': '03', 'Mar': '03',
    'April': '04', 'Apr': '04',
    'May': '05', 
    'June': '06', 'Jun': '06',
    'July': '07', 'Jul': '07',
    'August': '08', 'Aug': '08',
    'September': '09', 'Sep': '09', 'Sept': '09',
    'October': '10', 'Oct': '10',
    'November': '11', 'Nov': '11',
    'December': '12', 'Dec': '12'
}

# Build a monthly Period per row from 'YYYY-MM' text. The explicit format
# avoids date-format inference, and no temporary column has to be added
# and dropped again.
year_month_text = data['Year'].astype(str) + '-' + data['Month'].map(month_mapping)
data['YearMonth'] = pd.to_datetime(year_month_text, format='%Y-%m').dt.to_period('M')

data['Distributor'] = data['Distributor'].str.strip()
data['Customer Name'] = data['Customer Name'].str.strip()

# Keep rows with a non-negative quantity; .copy() makes the result an
# independent frame rather than a slice of the unfiltered one.
data = data[data['Quantity'] >= 0].copy()

data.head()

Unnamed: 0,Distributor,Customer Name,City,Country,Latitude,Longitude,Channel,Sub-channel,Product Name,Product Class,Quantity,Price,Sales,Month,Year,Name of Sales Rep,Manager,Sales Team,Product Class-Country,YearMonth
0,Gottlieb-Cruickshank,"Zieme, Doyle and Kunze",Lublin,Poland,51.2333,22.5667,Hospital,Private,Topipizole,Mood Stabilizers,4.0,368,1472.0,January,2018,Mary Gerrard,Britanny Bold,Delta,Mood Stabilizers-Poland,2018-01
1,Gottlieb-Cruickshank,Feest PLC,Świecie,Poland,53.4167,18.4333,Pharmacy,Retail,Choriotrisin,Antibiotics,7.0,591,4137.0,January,2018,Jessica Smith,Britanny Bold,Delta,Antibiotics-Poland,2018-01
2,Gottlieb-Cruickshank,Medhurst-Beer Pharmaceutical Limited,Rybnik,Poland,50.0833,18.5,Pharmacy,Institution,Acantaine,Antibiotics,30.0,66,1980.0,January,2018,Steve Pepple,Tracy Banks,Bravo,Antibiotics-Poland,2018-01
3,Gottlieb-Cruickshank,Barton Ltd Pharma Plc,Czeladź,Poland,50.3333,19.0833,Hospital,Private,Lioletine Refliruvax,Analgesics,6.0,435,2610.0,January,2018,Mary Gerrard,Britanny Bold,Delta,Analgesics-Poland,2018-01
4,Gottlieb-Cruickshank,Keeling LLC Pharmacy,Olsztyn,Poland,53.78,20.4942,Pharmacy,Retail,Oxymotroban Fexoformin,Analgesics,20.0,458,9160.0,January,2018,Anne Wu,Britanny Bold,Delta,Analgesics-Poland,2018-01


In [None]:
# Print the frame's summary (DataFrame.info) as a check on the prepared data.
data.info()

In [14]:
def _json_default(value):
    """Fallback for ``json.dump``: unwrap NumPy scalars into built-in Python values."""
    if isinstance(value, np.generic):
        return value.item()
    raise TypeError(f'Object of type {type(value).__name__} is not JSON serializable')


output_dir = 'analysis_result/Pharm-data'
# exist_ok makes the cell safe to re-run without a separate existence check.
os.makedirs(output_dir, exist_ok=True)

# One JSON report per product class / country combination.
for category in data['Product Class-Country'].unique():
    print(f'Processing: {category}')
    category_data = data[data['Product Class-Country'] == category]
    analysis = Analysis(category_data)
    analysis_result = analysis.analyze_all()
    with open(f'{output_dir}/analysis_result_{category}.json', 'w', encoding='utf-8') as f:
        json.dump(analysis_result, f, indent=4, default=_json_default)

Processing: Mood Stabilizers-Poland
Processing: Antibiotics-Poland




Processing: Analgesics-Poland
Processing: Antiseptics-Poland
Processing: Antipiretics-Poland




Processing: Antimalarial-Poland
Processing: Mood Stabilizers-Germany
Processing: Antipiretics-Germany
Processing: Antimalarial-Germany
Processing: Analgesics-Germany
Processing: Antiseptics-Germany
Processing: Antibiotics-Germany
