In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Load the cleaned product catalogue.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
df = pd.read_csv("C:/Users/cw/Downloads/Pharmaceutical/indian_pharmaceutical_products_cleaned.csv")

# Competition intensity: number of distinct manufacturers that offer the same
# primary ingredient at the same strength.
formulation_keys = ['primary_ingredient', 'primary_strength']
company_count = (
    df.groupby(formulation_keys)['manufacturer']
    .nunique()
    .rename('company_count')
    .reset_index()
)

# The left merge keeps every product row. groupby drops null keys by default,
# so a row whose ingredient or strength is null finds no match here and ends
# up with NaN in company_count.
df = df.merge(company_count, how='left', on=formulation_keys)

# Regulatory complexity proxy based on Indian GMP & CDSCO practices.
# Dosage forms are listed per burden tier (1 = low, 2 = medium, 3 = high) and
# then flattened into the form -> tier lookup used for mapping.
forms_by_tier = {
    1: ('tablet', 'capsule', 'powder'),
    2: (
        'cream', 'gel', 'ointment', 'syrup', 'suspension',
        'solution', 'drops', 'spray', 'other',
    ),
    3: ('injection', 'inhaler', 'respules'),
}

regulatory_map = {
    form: tier
    for tier, forms in forms_by_tier.items()
    for form in forms
}

# Translate each dosage form into its regulatory burden tier; a form that is
# not a key of regulatory_map becomes NaN.
df['regulatory_complexity'] = df['dosage_form'].map(regulatory_map)


# SELECT FEATURES FOR CLUSTERING
cluster_features = [
    'price_inr',               # Pricing / margin proxy
    'num_active_ingredients',  # Formulation complexity
    'company_count',           # Competition intensity
    'regulatory_complexity',   # Regulatory burden proxy
]

# Keep only the rows where every clustering feature is present, then narrow
# the frame down to those feature columns.
cluster_df = df.dropna(subset=cluster_features)[cluster_features]


# FEATURE SCALING
# Standardise every feature before clustering so that the features are on a
# comparable scale when K-Means measures distances.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(cluster_df)

# K-MEANS CLUSTERING
# Four clusters is a business-driven segmentation choice; the fixed
# random_state makes the initialisation repeatable.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
cluster_labels = kmeans.fit_predict(scaled_features)

# Keep each label next to the unscaled features it was derived from.
cluster_df = cluster_df.assign(market_cluster=cluster_labels)

# Attach cluster labels back to the main dataframe. Only the rows that were
# clustered are written; all other rows stay NaN in market_cluster.
df.loc[cluster_df.index, 'market_cluster'] = cluster_labels


# FORMAL MARKET SEGMENT NAMING
# NOTE(review): K-Means label ids carry no inherent meaning. This id -> name
# mapping presumably reflects the centroids observed for one particular run;
# re-check it against segment_summary whenever the data, the features, or the
# library version change.
cluster_name_map = dict(enumerate([
    'Commodity Generic Segment',
    'Highly Competitive Combination Segment',
    'Regulatory-Intensive Segment',
    'Premium Niche Segment',
]))

# Rows without a cluster label have no entry in the map and stay NaN.
df['market_segment'] = df['market_cluster'].map(cluster_name_map)

# Profile every named segment on the clustering features. Rows without a
# segment are not part of any group because groupby skips null keys.
segment_profile = df.groupby('market_segment').agg(
    avg_price=pd.NamedAgg(column='price_inr', aggfunc='mean'),
    avg_ingredients=pd.NamedAgg(column='num_active_ingredients', aggfunc='mean'),
    avg_competition=pd.NamedAgg(column='company_count', aggfunc='mean'),
    avg_regulatory_complexity=pd.NamedAgg(column='regulatory_complexity', aggfunc='mean'),
    product_count=pd.NamedAgg(column='price_inr', aggfunc='count'),
)

# Most expensive segment first.
segment_summary = segment_profile.sort_values('avg_price', ascending=False)

print(segment_summary)

                                            avg_price  avg_ingredients  \
market_segment                                                           
Premium Niche Segment                   174905.392558         1.023256   
Highly Competitive Combination Segment     466.222590         1.418350   
Regulatory-Intensive Segment               196.960099         1.000000   
Commodity Generic Segment                  121.239145         1.999911   

                                        avg_competition  \
market_segment                                            
Premium Niche Segment                          6.279070   
Highly Competitive Combination Segment       445.017986   
Regulatory-Intensive Segment                 648.655275   
Commodity Generic Segment                   1358.081370   

                                        avg_regulatory_complexity  \
market_segment                                                      
Premium Niche Segment                                    1.906

### Check the brands that are not assigned to any segment


In [2]:
# Number of rows that were left without a market segment.
df['market_segment'].isna().sum()

25187

In [3]:
# Explain why a row could not be clustered by naming the first clustering
# feature that is null. Clustering drops a row when ANY of the four features
# is missing, so each feature needs its own reason; otherwise a row lacking
# only its price or ingredient count would be reported as 'Complete'.
# Order matters: np.select uses the first condition that matches, and the two
# original reasons stay first so their labels are unchanged.
reason_by_feature = {
    'company_count': 'Missing competition data',
    'regulatory_complexity': 'Missing regulatory mapping',
    'price_inr': 'Missing price data',
    'num_active_ingredients': 'Missing ingredient count',
}

df['missing_reason'] = np.select(
    [df[feature].isna() for feature in reason_by_feature],
    list(reason_by_feature.values()),
    default='Complete',
)

df['missing_reason'].value_counts()


missing_reason
Complete                    228775
Missing competition data     25187
Name: count, dtype: int64

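##### **Missing `company_count` values arise because `primary_strength` is null for these rows: `groupby` drops null keys by default, so the left merge finds no matching count and the rows are later excluded from clustering. Null strengths occur for non-dose-based formulations such as creams, gels, ointments, and powders, where strength is not clinically meaningful or consistently reported.**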

In [6]:
# Persist the dataframe, including the segment columns, without the index.
# NOTE(review): absolute, machine-specific path — consider a configurable output directory.
segmented_csv_path = "C:/Users/cw/Downloads/Pharmaceutical/indian_pharmaceutical_products_segmented.csv"
df.to_csv(segmented_csv_path, index=False)